1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <assert.h> 30 #include <errno.h> 31 #include <stdlib.h> 32 #include <signal.h> 33 #include <unistd.h> 34 #include <ucontext.h> 35 #include <thread.h> 36 #include <strings.h> 37 #include <libintl.h> 38 #include <sys/regset.h> 39 #include <sys/syscall.h> 40 #include <sys/inttypes.h> 41 #include <sys/param.h> 42 #include <sys/types.h> 43 #include <sys/segments.h> 44 #include <signal.h> 45 #include <sys/lx_misc.h> 46 #include <sys/lx_types.h> 47 #include <sys/lx_signal.h> 48 #include <sys/lx_syscall.h> 49 #include <sys/lx_brand.h> 50 #include <sys/lx_debug.h> 51 #include <sys/lx_thread.h> 52 53 #define LX_CSIGNAL 0x000000ff 54 #define LX_CLONE_VM 0x00000100 55 #define LX_CLONE_FS 0x00000200 56 #define LX_CLONE_FILES 0x00000400 57 #define LX_CLONE_SIGHAND 0x00000800 58 #define LX_CLONE_PID 0x00001000 59 #define LX_CLONE_PTRACE 0x00002000 60 #define LX_CLONE_VFORK 0x00004000 61 #define LX_CLONE_PARENT 0x00008000 62 #define LX_CLONE_THREAD 0x00010000 63 #define LX_CLONE_SYSVSEM 0x00040000 64 #define LX_CLONE_SETTLS 0x00080000 65 #define LX_CLONE_PARENT_SETTID 0x00100000 66 #define LX_CLONE_CHILD_CLEARTID 0x00200000 67 #define LX_CLONE_DETACH 0x00400000 68 #define LX_CLONE_CHILD_SETTID 0x01000000 69 70 #define SHARED_AS \ 71 (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND) 72 #define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK) 73 #define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH) 74 75 #define IS_FORK(f) (((f) & SHARED_AS) == 0) 76 #define IS_VFORK(f) (((f) & CLONE_VFORK) == CLONE_VFORK) 77 78 #define LX_EXIT 1 79 #define LX_EXIT_GROUP 2 80 81 /* 82 * This is dicey. This seems to be an internal glibc structure, and not 83 * part of any external interface. Thus, it is subject to change without 84 * notice. FWIW, clone(2) itself seems to be an internal (or at least 85 * unstable) interface, since strace(1) shows it differently than the man 86 * page. 87 */ 88 struct lx_desc 89 { 90 uint32_t entry_number; 91 uint32_t base_addr; 92 uint32_t limit; 93 uint32_t seg_32bit:1; 94 uint32_t contents:2; 95 uint32_t read_exec_only:1; 96 uint32_t limit_in_pages:1; 97 uint32_t seg_not_present:1; 98 uint32_t useable:1; 99 uint32_t empty:25; 100 }; 101 102 struct clone_state { 103 void *c_retaddr; /* instr after clone()'s int80 */ 104 int c_flags; /* flags to clone(2) */ 105 int c_sig; /* signal to send on thread exit */ 106 void *c_stk; /* %esp of new thread */ 107 void *c_ptidp; 108 struct lx_desc *c_ldtinfo; /* thread-specific segment */ 109 void *c_ctidp; 110 uintptr_t c_gs; /* Linux's %gs */ 111 sigset_t c_sigmask; /* signal mask */ 112 lx_affmask_t c_affmask; /* CPU affinity mask */ 113 volatile int *c_clone_res; /* pid/error returned to cloner */ 114 }; 115 116 extern void lx_setup_clone(uintptr_t, void *, void *); 117 118 /* 119 * Counter incremented when we vfork(2) ourselves, and decremented when the 120 * vfork(2)ed child exit(2)s or exec(2)s. 121 */ 122 static int is_vforked = 0; 123 124 int 125 lx_exit(uintptr_t p1) 126 { 127 int ret, status = (int)p1; 128 lx_tsd_t *lx_tsd; 129 130 /* 131 * If we are a vfork(2)ed child, we need to exit as quickly and 132 * cleanly as possible to avoid corrupting our parent. 133 */ 134 if (is_vforked != 0) { 135 is_vforked--; 136 _exit(status); 137 } 138 139 if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) 140 lx_err_fatal(gettext( 141 "%s: unable to read thread-specific data: %s"), 142 "exit", strerror(ret)); 143 144 assert(lx_tsd != 0); 145 146 lx_tsd->lxtsd_exit = LX_EXIT; 147 lx_tsd->lxtsd_exit_status = status; 148 149 /* 150 * Block all signals in the exit context to avoid taking any signals 151 * (to the degree possible) while exiting. 152 */ 153 (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask); 154 155 /* 156 * This thread is exiting. Restore the state of the thread to 157 * what it was before we started running linux code. 158 */ 159 (void) setcontext(&lx_tsd->lxtsd_exit_context); 160 161 /* 162 * If we returned from the setcontext(2), something is very wrong. 163 */ 164 lx_err_fatal(gettext("%s: unable to set exit context: %s"), 165 "exit", strerror(errno)); 166 167 /*NOTREACHED*/ 168 return (0); 169 } 170 171 int 172 lx_group_exit(uintptr_t p1) 173 { 174 int ret, status = (int)p1; 175 lx_tsd_t *lx_tsd; 176 177 /* 178 * If we are a vfork(2)ed child, we need to exit as quickly and 179 * cleanly as possible to avoid corrupting our parent. 180 */ 181 if (is_vforked != 0) { 182 is_vforked--; 183 _exit(status); 184 } 185 186 if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) 187 lx_err_fatal(gettext( 188 "%s: unable to read thread-specific data: %s"), 189 "group_exit", strerror(ret)); 190 191 assert(lx_tsd != 0); 192 193 lx_tsd->lxtsd_exit = LX_EXIT_GROUP; 194 lx_tsd->lxtsd_exit_status = status; 195 196 /* 197 * Block all signals in the exit context to avoid taking any signals 198 * (to the degree possible) while exiting. 199 */ 200 (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask); 201 202 /* 203 * This thread is exiting. Restore the state of the thread to 204 * what it was before we started running linux code. 205 */ 206 (void) setcontext(&lx_tsd->lxtsd_exit_context); 207 208 /* 209 * If we returned from the setcontext(2), something is very wrong. 210 */ 211 lx_err_fatal(gettext("%s: unable to set exit context: %s"), 212 "group_exit", strerror(errno)); 213 214 /*NOTREACHED*/ 215 return (0); 216 } 217 218 static void * 219 clone_start(void *arg) 220 { 221 int rval; 222 struct clone_state *cs = (struct clone_state *)arg; 223 lx_tsd_t lx_tsd; 224 225 /* 226 * Let the kernel finish setting up all the needed state for this 227 * new thread. 228 * 229 * We already created the thread using the thr_create(3C) library 230 * call, so most of the work required to emulate lx_clone(2) has 231 * been done by the time we get to this point. Instead of creating 232 * a new brandsys(2) subcommand to perform the last few bits of 233 * bookkeeping, we just use the lx_clone() slot in the syscall 234 * table. 235 */ 236 lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()"); 237 lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)", 238 cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp); 239 240 rval = syscall(SYS_brand, B_EMULATE_SYSCALL + LX_SYS_clone, 241 cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp, 242 NULL); 243 244 /* 245 * At this point the parent is waiting for cs->c_clone_res to go 246 * non-zero to indicate the thread has been cloned. The value set 247 * in cs->c_clone_res will be used for the return value from 248 * clone(). 249 */ 250 if (rval < 0) { 251 *(cs->c_clone_res) = -errno; 252 lx_debug("\tkernel clone failed, errno %d\n", errno); 253 return (NULL); 254 } 255 256 if (lx_sched_setaffinity(0, sizeof (cs->c_affmask), 257 (uintptr_t)&cs->c_affmask) != 0) { 258 *(cs->c_clone_res) = -errno; 259 260 lx_err_fatal(gettext( 261 "Unable to set affinity mask in child thread: %s"), 262 strerror(errno)); 263 } 264 265 /* Initialize the thread specific data for this thread. */ 266 bzero(&lx_tsd, sizeof (lx_tsd)); 267 lx_tsd.lxtsd_gs = cs->c_gs; 268 269 /* 270 * Use the address of the stack-allocated lx_tsd as the 271 * per-thread storage area to cache various values for later 272 * use. 273 * 274 * This address is only used by this thread, so there is no 275 * danger of other threads using this storage area, nor of it 276 * being accessed once this stack frame has been freed. 277 */ 278 if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) { 279 *(cs->c_clone_res) = -errno; 280 lx_err_fatal( 281 gettext("Unable to set thread-specific ptr for clone: %s"), 282 strerror(rval)); 283 } 284 285 /* 286 * Save the current context of this thread. 287 * 288 * We'll restore this context when this thread attempts to exit. 289 */ 290 if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) { 291 *(cs->c_clone_res) = -errno; 292 293 lx_err_fatal(gettext( 294 "Unable to initialize thread-specific exit context: %s"), 295 strerror(errno)); 296 } 297 298 /* 299 * Do the final stack twiddling, reset %gs, and return to the 300 * clone(2) path. 301 */ 302 if (lx_tsd.lxtsd_exit == 0) { 303 if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) { 304 *(cs->c_clone_res) = -errno; 305 306 lx_err_fatal(gettext( 307 "Unable to release held signals for child " 308 "thread: %s"), strerror(errno)); 309 } 310 311 /* 312 * Let the parent know that the clone has (effectively) been 313 * completed. 314 */ 315 *(cs->c_clone_res) = rval; 316 317 lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk); 318 319 /* lx_setup_clone() should never return. */ 320 assert(0); 321 } 322 323 /* 324 * We are here because the Linux application called the exit() or 325 * exit_group() system call. In turn the brand library did a 326 * setcontext() to jump to the thread context state saved in 327 * getcontext(), above. 328 */ 329 if (lx_tsd.lxtsd_exit == LX_EXIT) 330 thr_exit((void *)lx_tsd.lxtsd_exit_status); 331 else 332 exit(lx_tsd.lxtsd_exit_status); 333 334 assert(0); 335 /*NOTREACHED*/ 336 } 337 338 int 339 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, 340 uintptr_t p5) 341 { 342 struct clone_state *cs; 343 int flags = (int)p1; 344 void *cldstk = (void *)p2; 345 void *ptidp = (void *)p3; 346 struct lx_desc *ldtinfo = (void *)p4; 347 void *ctidp = (void *)p5; 348 thread_t tid; 349 volatile int clone_res; 350 int sig; 351 int rval; 352 int pid; 353 lx_regs_t *rp; 354 sigset_t sigmask; 355 356 if (flags & LX_CLONE_SETTLS) { 357 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p " 358 "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp); 359 } else { 360 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)", 361 flags, cldstk, ptidp); 362 } 363 364 /* 365 * Only supported for pid 0 on Linux 366 */ 367 if (flags & LX_CLONE_PID) 368 return (-EINVAL); 369 370 /* 371 * CLONE_THREAD requires CLONE_SIGHAND. 372 * 373 * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared 374 * in kernel 2.4 and prior. 375 * In kernel 2.6 CLONE_DETACHED was dropped completely, so we no 376 * longer have this requirement. 377 */ 378 379 if (flags & CLONE_TD) { 380 if (!(flags & LX_CLONE_SIGHAND)) 381 return (-EINVAL); 382 if ((lx_get_kern_version() <= LX_KERN_2_4) && 383 (flags & CLONE_TD) != CLONE_TD) 384 return (-EINVAL); 385 } 386 387 rp = lx_syscall_regs(); 388 389 /* test if pointer passed by user are writable */ 390 if (flags & LX_CLONE_PARENT_SETTID) { 391 if (uucopy(ptidp, &pid, sizeof (int)) != 0) 392 return (-EFAULT); 393 if (uucopy(&pid, ptidp, sizeof (int)) != 0) 394 return (-EFAULT); 395 } 396 if (flags & LX_CLONE_CHILD_SETTID) { 397 if (uucopy(ctidp, &pid, sizeof (int)) != 0) 398 return (-EFAULT); 399 if (uucopy(&pid, ctidp, sizeof (int)) != 0) 400 return (-EFAULT); 401 } 402 403 /* See if this is a fork() operation or a thr_create(). */ 404 if (IS_FORK(flags) || IS_VFORK(flags)) { 405 if (flags & LX_CLONE_PARENT) { 406 lx_unsupported(gettext( 407 "clone(2) only supports CLONE_PARENT " 408 "for threads.\n")); 409 return (-ENOTSUP); 410 } 411 412 if (flags & LX_CLONE_PTRACE) 413 lx_ptrace_fork(); 414 415 if (flags & LX_CLONE_VFORK) { 416 is_vforked++; 417 rval = vfork(); 418 if (rval != 0) 419 is_vforked--; 420 } else { 421 rval = fork1(); 422 if (rval == 0 && lx_is_rpm) 423 (void) sleep(lx_rpm_delay); 424 } 425 426 /* 427 * Since we've already forked, we can't do much if uucopy fails, 428 * so we just ignore failure. Failure is unlikely since we've 429 * tested the memory before we did the fork. 430 */ 431 if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) { 432 (void) uucopy(&rval, ptidp, sizeof (int)); 433 } 434 435 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) { 436 /* 437 * lx_getpid should not fail, and if it does, there's 438 * not much we can do about it since we've already 439 * forked, so on failure, we just don't copy the 440 * memory. 441 */ 442 pid = lx_getpid(); 443 if (pid >= 0) 444 (void) uucopy(&pid, ctidp, sizeof (int)); 445 } 446 447 /* Parent just returns */ 448 if (rval != 0) 449 return ((rval < 0) ? -errno : rval); 450 451 /* 452 * If provided, the child needs its new stack set up. 453 */ 454 if (cldstk) 455 lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk); 456 457 return (0); 458 } 459 460 /* 461 * We have very restricted support.... only exactly these flags are 462 * supported 463 */ 464 if (((flags & SHARED_AS) != SHARED_AS)) { 465 lx_unsupported(gettext( 466 "clone(2) requires that all or none of CLONE_VM " 467 "CLONE_FS, CLONE_FILES, and CLONE_SIGHAND be set.\n")); 468 return (-ENOTSUP); 469 } 470 471 if (cldstk == NULL) { 472 lx_unsupported(gettext( 473 "clone(2) requires the caller to allocate the " 474 "child's stack.\n")); 475 return (-ENOTSUP); 476 } 477 478 /* 479 * If we want a signal-on-exit, ensure that the signal is valid. 480 */ 481 if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) { 482 lx_unsupported(gettext( 483 "clone(2) passed unsupported signal: %d"), sig); 484 return (-ENOTSUP); 485 } 486 487 /* 488 * To avoid malloc() here, we steal a part of the new thread's 489 * stack to store all the info that thread might need for 490 * initialization. We also make it 64-bit aligned for good 491 * measure. 492 */ 493 cs = (struct clone_state *) 494 ((p2 - sizeof (struct clone_state)) & -((uintptr_t)8)); 495 cs->c_flags = flags; 496 cs->c_sig = sig; 497 cs->c_stk = cldstk; 498 cs->c_ptidp = ptidp; 499 cs->c_ldtinfo = ldtinfo; 500 cs->c_ctidp = ctidp; 501 cs->c_clone_res = &clone_res; 502 cs->c_gs = rp->lxr_gs; 503 504 if (lx_sched_getaffinity(0, sizeof (cs->c_affmask), 505 (uintptr_t)&cs->c_affmask) == -1) 506 lx_err_fatal(gettext( 507 "Unable to get affinity mask for parent thread: %s"), 508 strerror(errno)); 509 510 /* 511 * We want the new thread to return directly to the return site for 512 * the system call. 513 */ 514 cs->c_retaddr = (void *)rp->lxr_eip; 515 clone_res = 0; 516 517 (void) sigfillset(&sigmask); 518 519 /* 520 * Block all signals because the thread we create won't be able to 521 * properly handle them until it's fully set up. 522 */ 523 if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) { 524 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno)); 525 return (-errno); 526 } 527 528 rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid); 529 530 /* 531 * Release any pending signals 532 */ 533 (void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL); 534 535 /* 536 * Wait for the child to be created and have its tid assigned. 537 */ 538 if (rval == 0) { 539 while (clone_res == 0) 540 ; 541 542 rval = clone_res; 543 } 544 545 return (rval); 546 }