1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 #include <assert.h>
  30 #include <errno.h>
  31 #include <stdlib.h>
  32 #include <signal.h>
  33 #include <unistd.h>
  34 #include <ucontext.h>
  35 #include <thread.h>
  36 #include <strings.h>
  37 #include <libintl.h>
  38 #include <sys/regset.h>
  39 #include <sys/syscall.h>
  40 #include <sys/inttypes.h>
  41 #include <sys/param.h>
  42 #include <sys/types.h>
  43 #include <sys/segments.h>
  44 #include <signal.h>
  45 #include <sys/lx_misc.h>
  46 #include <sys/lx_types.h>
  47 #include <sys/lx_signal.h>
  48 #include <sys/lx_syscall.h>
  49 #include <sys/lx_brand.h>
  50 #include <sys/lx_debug.h>
  51 #include <sys/lx_thread.h>
  52 
/*
 * Linux clone(2) flag bits as they arrive in the first syscall argument.
 * The low byte (LX_CSIGNAL) is the Linux signal number requested for
 * delivery when the child exits; lx_clone() translates it via ltos_signo[].
 */
#define LX_CSIGNAL              0x000000ff
#define LX_CLONE_VM             0x00000100
#define LX_CLONE_FS             0x00000200
#define LX_CLONE_FILES          0x00000400
#define LX_CLONE_SIGHAND        0x00000800
#define LX_CLONE_PID            0x00001000
#define LX_CLONE_PTRACE         0x00002000
#define LX_CLONE_VFORK          0x00004000
#define LX_CLONE_PARENT         0x00008000
#define LX_CLONE_THREAD         0x00010000
#define LX_CLONE_SYSVSEM        0x00040000
#define LX_CLONE_SETTLS         0x00080000
#define LX_CLONE_PARENT_SETTID  0x00100000
#define LX_CLONE_CHILD_CLEARTID 0x00200000
#define LX_CLONE_DETACH         0x00400000
#define LX_CLONE_CHILD_SETTID   0x01000000

/*
 * Flag combinations used to classify a clone(2) request:
 *   SHARED_AS   - the four "share everything" flags; the thread-creation
 *                 path in lx_clone() requires all of them to be set.
 *   CLONE_VFORK - both bits that together identify a vfork()-style clone.
 *   CLONE_TD    - the thread / detached pair checked for consistency.
 */
#define SHARED_AS       \
        (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND)
#define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK)
#define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH)

/* True when none of the SHARED_AS flags is set: a plain fork(). */
#define IS_FORK(f)      (((f) & SHARED_AS) == 0)
/* True when both LX_CLONE_VM and LX_CLONE_VFORK are set. */
#define IS_VFORK(f)     (((f) & CLONE_VFORK) == CLONE_VFORK)

/*
 * Values stored in lxtsd_exit to record which exit flavor was requested:
 * LX_EXIT ends only the calling thread, LX_EXIT_GROUP ends the process
 * (see clone_start()).  Zero means no exit has been requested.
 */
#define LX_EXIT         1
#define LX_EXIT_GROUP   2
  80 
/*
 * This is dicey.  This seems to be an internal glibc structure, and not
 * part of any external interface.  Thus, it is subject to change without
 * notice.  FWIW, clone(2) itself seems to be an internal (or at least
 * unstable) interface, since strace(1) shows it differently than the man
 * page.
 *
 * In this file the structure is only ever handled by pointer: lx_clone()
 * receives it as the fourth clone(2) argument and it is passed through,
 * uninterpreted, to the kernel by clone_start().
 *
 * NOTE(review): the field names and bit widths look like they mirror the
 * Linux segment-descriptor layout used for thread-local storage -- confirm
 * against the Linux headers before changing anything here.  The bitfields
 * total exactly 32 bits (1+2+1+1+1+1+25).
 */
struct lx_desc
{
        uint32_t entry_number;
        uint32_t base_addr;
        uint32_t limit;
        uint32_t seg_32bit:1;
        uint32_t contents:2;
        uint32_t read_exec_only:1;
        uint32_t limit_in_pages:1;
        uint32_t seg_not_present:1;
        uint32_t useable:1;
        uint32_t empty:25;      /* pads the bitfields out to 32 bits */
};
 101 
/*
 * Everything a newly created thread needs in order to finish emulating
 * clone(2).  lx_clone() carves this structure out of the top of the
 * child's own stack (just below the caller-supplied stack pointer) and
 * hands its address to clone_start() as the thread argument.
 */
struct clone_state {
        void            *c_retaddr;     /* instr after clone()'s int80 */
        int             c_flags;        /* flags to clone(2) */
        int             c_sig;          /* signal to send on thread exit */
        void            *c_stk;         /* %esp of new thread */
        void            *c_ptidp;       /* clone(2) arg 3: parent tid ptr */
        struct lx_desc  *c_ldtinfo;     /* thread-specific segment */
        void            *c_ctidp;       /* clone(2) arg 5: child tid ptr */
        uintptr_t       c_gs;           /* Linux's %gs */
        sigset_t        c_sigmask;      /* signal mask */
        lx_affmask_t    c_affmask;      /* CPU affinity mask */
        /*
         * Points at a local variable in the cloning thread's lx_clone()
         * frame.  The cloner spins until this becomes non-zero, so the
         * child must store to it exactly once and never touch it again.
         */
        volatile int    *c_clone_res;   /* pid/error returned to cloner */
};
 115 
/*
 * Defined elsewhere.  Called with the Linux %gs value, the address to
 * resume at, and the new stack pointer; the thread-creation path treats
 * it as a function that never returns.
 */
extern void lx_setup_clone(uintptr_t, void *, void *);

/*
 * Counter incremented when we vfork(2) ourselves, and decremented when the
 * vfork(2)ed child exit(2)s or exec(2)s.
 *
 * NOTE(review): lx_clone() also decrements this in the parent once vfork()
 * returns non-zero.  If the vfork child shares the parent's memory and
 * leaves through lx_exit()/lx_group_exit() (which decrement it as well),
 * the counter would appear to be decremented twice for one increment --
 * confirm that it cannot go negative.
 */
static int is_vforked = 0;
 123 
 124 int
 125 lx_exit(uintptr_t p1)
 126 {
 127         int             ret, status = (int)p1;
 128         lx_tsd_t        *lx_tsd;
 129 
 130         /*
 131          * If we are a vfork(2)ed child, we need to exit as quickly and
 132          * cleanly as possible to avoid corrupting our parent.
 133          */
 134         if (is_vforked != 0) {
 135                 is_vforked--;
 136                 _exit(status);
 137         }
 138 
 139         if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
 140                 lx_err_fatal(gettext(
 141                     "%s: unable to read thread-specific data: %s"),
 142                     "exit", strerror(ret));
 143 
 144         assert(lx_tsd != 0);
 145 
 146         lx_tsd->lxtsd_exit = LX_EXIT;
 147         lx_tsd->lxtsd_exit_status = status;
 148 
 149         /*
 150          * Block all signals in the exit context to avoid taking any signals
 151          * (to the degree possible) while exiting.
 152          */
 153         (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
 154 
 155         /*
 156          * This thread is exiting.  Restore the state of the thread to
 157          * what it was before we started running linux code.
 158          */
 159         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 160 
 161         /*
 162          * If we returned from the setcontext(2), something is very wrong.
 163          */
 164         lx_err_fatal(gettext("%s: unable to set exit context: %s"),
 165             "exit", strerror(errno));
 166 
 167         /*NOTREACHED*/
 168         return (0);
 169 }
 170 
 171 int
 172 lx_group_exit(uintptr_t p1)
 173 {
 174         int             ret, status = (int)p1;
 175         lx_tsd_t        *lx_tsd;
 176 
 177         /*
 178          * If we are a vfork(2)ed child, we need to exit as quickly and
 179          * cleanly as possible to avoid corrupting our parent.
 180          */
 181         if (is_vforked != 0) {
 182                 is_vforked--;
 183                 _exit(status);
 184         }
 185 
 186         if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
 187                 lx_err_fatal(gettext(
 188                     "%s: unable to read thread-specific data: %s"),
 189                     "group_exit", strerror(ret));
 190 
 191         assert(lx_tsd != 0);
 192 
 193         lx_tsd->lxtsd_exit = LX_EXIT_GROUP;
 194         lx_tsd->lxtsd_exit_status = status;
 195 
 196         /*
 197          * Block all signals in the exit context to avoid taking any signals
 198          * (to the degree possible) while exiting.
 199          */
 200         (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
 201 
 202         /*
 203          * This thread is exiting.  Restore the state of the thread to
 204          * what it was before we started running linux code.
 205          */
 206         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 207 
 208         /*
 209          * If we returned from the setcontext(2), something is very wrong.
 210          */
 211         lx_err_fatal(gettext("%s: unable to set exit context: %s"),
 212             "group_exit", strerror(errno));
 213 
 214         /*NOTREACHED*/
 215         return (0);
 216 }
 217 
 218 static void *
 219 clone_start(void *arg)
 220 {
 221         int rval;
 222         struct clone_state *cs = (struct clone_state *)arg;
 223         lx_tsd_t lx_tsd;
 224 
 225         /*
 226          * Let the kernel finish setting up all the needed state for this
 227          * new thread.
 228          *
 229          * We already created the thread using the thr_create(3C) library
 230          * call, so most of the work required to emulate lx_clone(2) has
 231          * been done by the time we get to this point.  Instead of creating
 232          * a new brandsys(2) subcommand to perform the last few bits of
 233          * bookkeeping, we just use the lx_clone() slot in the syscall
 234          * table.
 235          */
 236         lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
 237         lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)",
 238             cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
 239 
 240         rval = syscall(SYS_brand, B_EMULATE_SYSCALL + LX_SYS_clone,
 241             cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp,
 242             NULL);
 243 
 244         /*
 245          * At this point the parent is waiting for cs->c_clone_res to go
 246          * non-zero to indicate the thread has been cloned.  The value set
 247          * in cs->c_clone_res will be used for the return value from
 248          * clone().
 249          */
 250         if (rval < 0) {
 251                 *(cs->c_clone_res) = -errno;
 252                 lx_debug("\tkernel clone failed, errno %d\n", errno);
 253                 return (NULL);
 254         }
 255 
 256         if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
 257             (uintptr_t)&cs->c_affmask) != 0) {
 258                 *(cs->c_clone_res) = -errno;
 259 
 260                 lx_err_fatal(gettext(
 261                     "Unable to set affinity mask in child thread: %s"),
 262                     strerror(errno));
 263         }
 264 
 265         /* Initialize the thread specific data for this thread. */
 266         bzero(&lx_tsd, sizeof (lx_tsd));
 267         lx_tsd.lxtsd_gs = cs->c_gs;
 268 
 269         /*
 270          * Use the address of the stack-allocated lx_tsd as the
 271          * per-thread storage area to cache various values for later
 272          * use.
 273          *
 274          * This address is only used by this thread, so there is no
 275          * danger of other threads using this storage area, nor of it
 276          * being accessed once this stack frame has been freed.
 277          */
 278         if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) {
 279                 *(cs->c_clone_res) = -errno;
 280                 lx_err_fatal(
 281                     gettext("Unable to set thread-specific ptr for clone: %s"),
 282                     strerror(rval));
 283         }
 284 
 285         /*
 286          * Save the current context of this thread.
 287          *
 288          * We'll restore this context when this thread attempts to exit.
 289          */
 290         if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) {
 291                 *(cs->c_clone_res) = -errno;
 292 
 293                 lx_err_fatal(gettext(
 294                     "Unable to initialize thread-specific exit context: %s"),
 295                     strerror(errno));
 296         }
 297 
 298         /*
 299          * Do the final stack twiddling, reset %gs, and return to the
 300          * clone(2) path.
 301          */
 302         if (lx_tsd.lxtsd_exit == 0) {
 303                 if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
 304                         *(cs->c_clone_res) = -errno;
 305 
 306                         lx_err_fatal(gettext(
 307                             "Unable to release held signals for child "
 308                             "thread: %s"), strerror(errno));
 309                 }
 310 
 311                 /*
 312                  * Let the parent know that the clone has (effectively) been
 313                  * completed.
 314                  */
 315                 *(cs->c_clone_res) = rval;
 316 
 317                 lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk);
 318 
 319                 /* lx_setup_clone() should never return. */
 320                 assert(0);
 321         }
 322 
 323         /*
 324          * We are here because the Linux application called the exit() or
 325          * exit_group() system call.  In turn the brand library did a
 326          * setcontext() to jump to the thread context state saved in
 327          * getcontext(), above.
 328          */
 329         if (lx_tsd.lxtsd_exit == LX_EXIT)
 330                 thr_exit((void *)lx_tsd.lxtsd_exit_status);
 331         else
 332                 exit(lx_tsd.lxtsd_exit_status);
 333 
 334         assert(0);
 335         /*NOTREACHED*/
 336 }
 337 
 338 int
 339 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
 340         uintptr_t p5)
 341 {
 342         struct clone_state *cs;
 343         int flags = (int)p1;
 344         void *cldstk = (void *)p2;
 345         void *ptidp = (void *)p3;
 346         struct lx_desc *ldtinfo = (void *)p4;
 347         void *ctidp = (void *)p5;
 348         thread_t tid;
 349         volatile int clone_res;
 350         int sig;
 351         int rval;
 352         int pid;
 353         lx_regs_t *rp;
 354         sigset_t sigmask;
 355 
 356         if (flags & LX_CLONE_SETTLS) {
 357                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
 358                     "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
 359         } else {
 360                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
 361                     flags, cldstk, ptidp);
 362         }
 363 
 364         /*
 365          * Only supported for pid 0 on Linux
 366          */
 367         if (flags & LX_CLONE_PID)
 368                 return (-EINVAL);
 369 
 370         /*
 371          * CLONE_THREAD requires CLONE_SIGHAND.
 372          *
 373          * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared
 374          * in kernel 2.4 and prior.
 375          * In kernel 2.6 CLONE_DETACHED was dropped completely, so we no
 376          * longer have this requirement.
 377          */
 378 
 379         if (flags & CLONE_TD) {
 380                 if (!(flags & LX_CLONE_SIGHAND))
 381                         return (-EINVAL);
 382                 if ((lx_get_kern_version() <= LX_KERN_2_4) &&
 383                     (flags & CLONE_TD) != CLONE_TD)
 384                         return (-EINVAL);
 385         }
 386 
 387         rp = lx_syscall_regs();
 388 
 389         /* test if pointer passed by user are writable */
 390         if (flags & LX_CLONE_PARENT_SETTID) {
 391                 if (uucopy(ptidp, &pid, sizeof (int)) != 0)
 392                         return (-EFAULT);
 393                 if (uucopy(&pid, ptidp, sizeof (int)) != 0)
 394                         return (-EFAULT);
 395         }
 396         if (flags & LX_CLONE_CHILD_SETTID) {
 397                 if (uucopy(ctidp, &pid, sizeof (int)) != 0)
 398                         return (-EFAULT);
 399                 if (uucopy(&pid, ctidp, sizeof (int)) != 0)
 400                         return (-EFAULT);
 401         }
 402 
 403         /* See if this is a fork() operation or a thr_create().  */
 404         if (IS_FORK(flags) || IS_VFORK(flags)) {
 405                 if (flags & LX_CLONE_PARENT) {
 406                         lx_unsupported(gettext(
 407                             "clone(2) only supports CLONE_PARENT "
 408                             "for threads.\n"));
 409                         return (-ENOTSUP);
 410                 }
 411 
 412                 if (flags & LX_CLONE_PTRACE)
 413                         lx_ptrace_fork();
 414 
 415                 if (flags & LX_CLONE_VFORK) {
 416                         is_vforked++;
 417                         rval = vfork();
 418                         if (rval != 0)
 419                                 is_vforked--;
 420                 } else {
 421                         rval = fork1();
 422                         if (rval == 0 && lx_is_rpm)
 423                                 (void) sleep(lx_rpm_delay);
 424                 }
 425 
 426                 /*
 427                  * Since we've already forked, we can't do much if uucopy fails,
 428                  * so we just ignore failure. Failure is unlikely since we've
 429                  * tested the memory before we did the fork.
 430                  */
 431                 if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
 432                         (void) uucopy(&rval, ptidp, sizeof (int));
 433                 }
 434 
 435                 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
 436                         /*
 437                          * lx_getpid should not fail, and if it does, there's
 438                          * not much we can do about it since we've already
 439                          * forked, so on failure, we just don't copy the
 440                          * memory.
 441                          */
 442                         pid = lx_getpid();
 443                         if (pid >= 0)
 444                                 (void) uucopy(&pid, ctidp, sizeof (int));
 445                 }
 446 
 447                 /* Parent just returns */
 448                 if (rval != 0)
 449                         return ((rval < 0) ? -errno : rval);
 450 
 451                 /*
 452                  * If provided, the child needs its new stack set up.
 453                  */
 454                 if (cldstk)
 455                         lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk);
 456 
 457                 return (0);
 458         }
 459 
 460         /*
 461          * We have very restricted support.... only exactly these flags are
 462          * supported
 463          */
 464         if (((flags & SHARED_AS) != SHARED_AS)) {
 465                 lx_unsupported(gettext(
 466                     "clone(2) requires that all or none of CLONE_VM "
 467                     "CLONE_FS, CLONE_FILES, and CLONE_SIGHAND be set.\n"));
 468                 return (-ENOTSUP);
 469         }
 470 
 471         if (cldstk == NULL) {
 472                 lx_unsupported(gettext(
 473                     "clone(2) requires the caller to allocate the "
 474                     "child's stack.\n"));
 475                 return (-ENOTSUP);
 476         }
 477 
 478         /*
 479          * If we want a signal-on-exit, ensure that the signal is valid.
 480          */
 481         if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
 482                 lx_unsupported(gettext(
 483                     "clone(2) passed unsupported signal: %d"), sig);
 484                 return (-ENOTSUP);
 485         }
 486 
 487         /*
 488          * To avoid malloc() here, we steal a part of the new thread's
 489          * stack to store all the info that thread might need for
 490          * initialization.  We also make it 64-bit aligned for good
 491          * measure.
 492          */
 493         cs = (struct clone_state *)
 494             ((p2 - sizeof (struct clone_state)) & -((uintptr_t)8));
 495         cs->c_flags = flags;
 496         cs->c_sig = sig;
 497         cs->c_stk = cldstk;
 498         cs->c_ptidp = ptidp;
 499         cs->c_ldtinfo = ldtinfo;
 500         cs->c_ctidp = ctidp;
 501         cs->c_clone_res = &clone_res;
 502         cs->c_gs = rp->lxr_gs;
 503 
 504         if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
 505             (uintptr_t)&cs->c_affmask) == -1)
 506                 lx_err_fatal(gettext(
 507                     "Unable to get affinity mask for parent thread: %s"),
 508                     strerror(errno));
 509 
 510         /*
 511          * We want the new thread to return directly to the return site for
 512          * the system call.
 513          */
 514         cs->c_retaddr = (void *)rp->lxr_eip;
 515         clone_res = 0;
 516 
 517         (void) sigfillset(&sigmask);
 518 
 519         /*
 520          * Block all signals because the thread we create won't be able to
 521          * properly handle them until it's fully set up.
 522          */
 523         if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) {
 524                 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
 525                 return (-errno);
 526         }
 527 
 528         rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
 529 
 530         /*
 531          * Release any pending signals
 532          */
 533         (void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL);
 534 
 535         /*
 536          * Wait for the child to be created and have its tid assigned.
 537          */
 538         if (rval == 0) {
 539                 while (clone_res == 0)
 540                         ;
 541 
 542                 rval = clone_res;
 543         }
 544 
 545         return (rval);
 546 }