1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/param.h>
  27 #include <sys/vmparam.h>
  28 #include <sys/types.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/systm.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/signal.h>
  33 #include <sys/stack.h>
  34 #include <sys/cred.h>
  35 #include <sys/user.h>
  36 #include <sys/debug.h>
  37 #include <sys/errno.h>
  38 #include <sys/proc.h>
  39 #include <sys/var.h>
  40 #include <sys/inline.h>
  41 #include <sys/syscall.h>
  42 #include <sys/ucontext.h>
  43 #include <sys/cpuvar.h>
  44 #include <sys/siginfo.h>
  45 #include <sys/trap.h>
  46 #include <sys/machtrap.h>
  47 #include <sys/sysinfo.h>
  48 #include <sys/procfs.h>
  49 #include <sys/prsystm.h>
  50 #include <sys/fpu/fpusystm.h>
  51 #include <sys/modctl.h>
  52 #include <sys/aio_impl.h>
  53 #include <c2/audit.h>
  54 #include <sys/tnf.h>
  55 #include <sys/tnf_probe.h>
  56 #include <sys/machpcb.h>
  57 #include <sys/privregs.h>
  58 #include <sys/copyops.h>
  59 #include <sys/timer.h>
  60 #include <sys/priv.h>
  61 #include <sys/msacct.h>
  62 
/*
 * Run-time switch for the debugging trace output printed by pre_syscall()
 * and post_syscall().  It is only consulted in kernels compiled with
 * SYSCALLTRACE defined.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
static kmutex_t systrace_lock;          /* syscall tracing lock */
#endif /* SYSCALLTRACE */

/*
 * Forward declaration; the definition is not in this part of the file.
 * save_syscall_args() drops the returned lock with rw_exit().
 */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
  69 
  70 #ifdef _SYSCALL32_IMPL
  71 static struct sysent *
  72 lwp_getsysent(klwp_t *lwp)
  73 {
  74         if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
  75                 return (sysent);
  76         return (sysent32);
  77 }
  78 #define LWP_GETSYSENT(lwp)      (lwp_getsysent(lwp))
  79 #else
  80 #define LWP_GETSYSENT(lwp)      (sysent)
  81 #endif
  82 
  83 /*
  84  * Called to restore the lwp's register window just before
  85  * returning to user level (only if the registers have been
  86  * fetched or modified through /proc).
  87  */
  88 /*ARGSUSED1*/
  89 void
  90 xregrestore(klwp_t *lwp, int shared)
  91 {
  92         /*
  93          * If locals+ins were modified by /proc copy them out.
  94          * Also copy to the shared window, if necessary.
  95          */
  96         if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
  97                 struct machpcb *mpcb = lwptompcb(lwp);
  98                 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
  99 
 100                 size_t rwinsize;
 101                 caddr_t rwp;
 102                 int is64;
 103 
 104                 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
 105                         rwinsize = sizeof (struct rwindow);
 106                         rwp = sp + STACK_BIAS;
 107                         is64 = 1;
 108                 } else {
 109                         rwinsize = sizeof (struct rwindow32);
 110                         sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
 111                         rwp = sp;
 112                         is64 = 0;
 113                 }
 114 
 115                 if (is64)
 116                         (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
 117                             rwp, rwinsize);
 118                 else {
 119                         struct rwindow32 rwindow32;
 120                         int watched;
 121 
 122                         watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
 123                         rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
 124                         (void) copyout(&rwindow32, rwp, rwinsize);
 125                         if (watched)
 126                                 watch_enable_addr(rwp, rwinsize, S_WRITE);
 127                 }
 128 
 129                 /* also copy to the user return window */
 130                 mpcb->mpcb_rsp[0] = sp;
 131                 mpcb->mpcb_rsp[1] = NULL;
 132                 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
 133                     sizeof (lwp->lwp_pcb.pcb_xregs));
 134         }
 135         lwp->lwp_pcb.pcb_xregstat = XREGNONE;
 136 }
 137 
 138 
 139 /*
 140  * Get the arguments to the current system call.
 141  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 142  *      If the user is going to change the out registers and might want to
 143  *      get the args (for /proc tracing), it must copy the args elsewhere
 144  *      via save_syscall_args().
 145  */
 146 uint_t
 147 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
 148 {
 149         kthread_t       *t = lwptot(lwp);
 150         uint_t  code = t->t_sysnum;
 151         long    mask;
 152         long    *ap;
 153         int     nargs;
 154 
 155         if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
 156                 mask = (uint32_t)0xffffffffU;
 157         else
 158                 mask = 0xffffffffffffffff;
 159 
 160         if (code != 0 && code < NSYSCALL) {
 161 
 162                 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
 163 
 164                 ASSERT(nargs <= MAXSYSARGS);
 165 
 166                 *nargsp = nargs;
 167                 ap = lwp->lwp_ap;
 168                 while (nargs-- > 0)
 169                         *argp++ = *ap++ & mask;
 170         } else {
 171                 *nargsp = 0;
 172         }
 173         return (code);
 174 }
 175 
 176 #ifdef _SYSCALL32_IMPL
 177 /*
 178  * Get the arguments to the current 32-bit system call.
 179  */
 180 uint_t
 181 get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
 182 {
 183         long args[MAXSYSARGS];
 184         uint_t i, code;
 185 
 186         code = get_syscall_args(lwp, args, nargsp);
 187         for (i = 0; i != *nargsp; i++)
 188                 *argp++ = (int)args[i];
 189         return (code);
 190 }
 191 #endif
 192 
 193 /*
 194  *      Save the system call arguments in a safe place.
 195  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 196  *      If the user is going to change the out registers, g1, or the stack,
 197  *      and might want to get the args (for /proc tracing), it must copy
 198  *      the args elsewhere via save_syscall_args().
 199  *
 200  *      This may be called from stop() even when we're not in a system call.
 201  *      Since there's no easy way to tell, this must be safe (not panic).
 202  *      If the copyins get data faults, return non-zero.
 203  */
 204 int
 205 save_syscall_args()
 206 {
 207         kthread_t       *t = curthread;
 208         klwp_t          *lwp = ttolwp(t);
 209         struct regs     *rp = lwptoregs(lwp);
 210         uint_t          code = t->t_sysnum;
 211         uint_t          nargs;
 212         int             i;
 213         caddr_t         ua;
 214         model_t         datamodel;
 215 
 216         if (lwp->lwp_argsaved || code == 0)
 217                 return (0);             /* args already saved or not needed */
 218 
 219         if (code >= NSYSCALL) {
 220                 nargs = 0;              /* illegal syscall */
 221         } else {
 222                 struct sysent *se = LWP_GETSYSENT(lwp);
 223                 struct sysent *callp = se + code;
 224 
 225                 nargs = callp->sy_narg;
 226                 if (LOADABLE_SYSCALL(callp) && nargs == 0) {
 227                         krwlock_t       *module_lock;
 228 
 229                         /*
 230                          * Find out how many arguments the system
 231                          * call uses.
 232                          *
 233                          * We have the property that loaded syscalls
 234                          * never change the number of arguments they
 235                          * use after they've been loaded once.  This
 236                          * allows us to stop for /proc tracing without
 237                          * holding the module lock.
 238                          * /proc is assured that sy_narg is valid.
 239                          */
 240                         module_lock = lock_syscall(se, code);
 241                         nargs = callp->sy_narg;
 242                         rw_exit(module_lock);
 243                 }
 244         }
 245 
 246         /*
 247          * Fetch the system call arguments.
 248          */
 249         if (nargs == 0)
 250                 goto out;
 251 
 252 
 253         ASSERT(nargs <= MAXSYSARGS);
 254 
 255         if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
 256 
 257                 if (rp->r_g1 == 0) { /* indirect syscall */
 258 
 259                         lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
 260                         lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
 261                         lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
 262                         lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
 263                         lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
 264                         if (nargs > 5) {
 265                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 266                                     (rp->r_sp + MINFRAME32);
 267                                 for (i = 5; i < nargs; i++) {
 268                                         uint32_t a;
 269                                         if (fuword32(ua, &a) != 0)
 270                                                 return (-1);
 271                                         lwp->lwp_arg[i] = a;
 272                                         ua += sizeof (a);
 273                                 }
 274                         }
 275                 } else {
 276                         lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
 277                         lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
 278                         lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
 279                         lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
 280                         lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
 281                         lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
 282                         if (nargs > 6) {
 283                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 284                                     (rp->r_sp + MINFRAME32);
 285                                 for (i = 6; i < nargs; i++) {
 286                                         uint32_t a;
 287                                         if (fuword32(ua, &a) != 0)
 288                                                 return (-1);
 289                                         lwp->lwp_arg[i] = a;
 290                                         ua += sizeof (a);
 291                                 }
 292                         }
 293                 }
 294         } else {
 295                 ASSERT(datamodel == DATAMODEL_LP64);
 296                 lwp->lwp_arg[0] = rp->r_o0;
 297                 lwp->lwp_arg[1] = rp->r_o1;
 298                 lwp->lwp_arg[2] = rp->r_o2;
 299                 lwp->lwp_arg[3] = rp->r_o3;
 300                 lwp->lwp_arg[4] = rp->r_o4;
 301                 lwp->lwp_arg[5] = rp->r_o5;
 302                 if (nargs > 6) {
 303                         ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
 304                         for (i = 6; i < nargs; i++) {
 305                                 unsigned long a;
 306                                 if (fulword(ua, &a) != 0)
 307                                         return (-1);
 308                                 lwp->lwp_arg[i] = a;
 309                                 ua += sizeof (a);
 310                         }
 311                 }
 312         }
 313 
 314 out:
 315         lwp->lwp_ap = lwp->lwp_arg;
 316         lwp->lwp_argsaved = 1;
 317         t->t_post_sys = 1;   /* so lwp_ap will be reset */
 318         return (0);
 319 }
 320 
 321 void
 322 reset_syscall_args(void)
 323 {
 324         klwp_t *lwp = ttolwp(curthread);
 325 
 326         lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
 327         lwp->lwp_argsaved = 0;
 328 }
 329 
 330 /*
 331  * nonexistent system call-- signal lwp (may want to handle it)
 332  * flag error if lwp won't see signal immediately
 333  * This works for old or new calling sequence.
 334  */
 335 int64_t
 336 nosys()
 337 {
 338         tsignal(curthread, SIGSYS);
 339         return ((int64_t)set_errno(ENOSYS));
 340 }
 341 
 342 /*
 343  * Perform pre-system-call processing, including stopping for tracing,
 344  * auditing, microstate-accounting, etc.
 345  *
 346  * This routine is called only if the t_pre_sys flag is set.  Any condition
 347  * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 348  * condition is persistent, this routine will repost t_pre_sys.
 349  */
 350 int
 351 pre_syscall(int arg0)
 352 {
 353         unsigned int code;
 354         kthread_t *t = curthread;
 355         proc_t *p = ttoproc(t);
 356         klwp_t *lwp = ttolwp(t);
 357         struct regs *rp = lwptoregs(lwp);
 358         int     repost;
 359 
 360         t->t_pre_sys = repost = 0;   /* clear pre-syscall processing flag */
 361 
 362         ASSERT(t->t_schedflag & TS_DONT_SWAP);
 363 
 364         syscall_mstate(LMS_USER, LMS_SYSTEM);
 365 
 366         /*
 367          * The syscall arguments in the out registers should be pointed to
 368          * by lwp_ap.  If the args need to be copied so that the outs can
 369          * be changed without losing the ability to get the args for /proc,
 370          * they can be saved by save_syscall_args(), and lwp_ap will be
 371          * restored by post_syscall().
 372          */
 373         ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
 374 
 375         /*
 376          * Make sure the thread is holding the latest credentials for the
 377          * process.  The credentials in the process right now apply to this
 378          * thread for the entire system call.
 379          */
 380         if (t->t_cred != p->p_cred) {
 381                 cred_t *oldcred = t->t_cred;
 382                 /*
 383                  * DTrace accesses t_cred in probe context.  t_cred must
 384                  * always be either NULL, or point to a valid, allocated cred
 385                  * structure.
 386                  */
 387                 t->t_cred = crgetcred();
 388                 crfree(oldcred);
 389         }
 390 
 391         /*
 392          * Undo special arrangements to single-step the lwp
 393          * so that a debugger will see valid register contents.
 394          * Also so that the pc is valid for syncfpu().
 395          * Also so that a syscall like exec() can be stepped.
 396          */
 397         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 398                 (void) prundostep();
 399                 repost = 1;
 400         }
 401 
 402         /*
 403          * Check for indirect system call in case we stop for tracing.
 404          * Don't allow multiple indirection.
 405          */
 406         code = t->t_sysnum;
 407         if (code == 0 && arg0 != 0) {           /* indirect syscall */
 408                 code = arg0;
 409                 t->t_sysnum = arg0;
 410         }
 411 
 412         /*
 413          * From the proc(4) manual page:
 414          * When entry to a system call is being traced, the traced process
 415          * stops after having begun the call to the system but before the
 416          * system call arguments have been fetched from the process.
 417          * If proc changes the args we must refetch them after starting.
 418          */
 419         if (PTOU(p)->u_systrap) {
 420                 if (prismember(&PTOU(p)->u_entrymask, code)) {
 421                         /*
 422                          * Recheck stop condition, now that lock is held.
 423                          */
 424                         mutex_enter(&p->p_lock);
 425                         if (PTOU(p)->u_systrap &&
 426                             prismember(&PTOU(p)->u_entrymask, code)) {
 427                                 stop(PR_SYSENTRY, code);
 428                                 /*
 429                                  * Must refetch args since they were
 430                                  * possibly modified by /proc.  Indicate
 431                                  * that the valid copy is in the
 432                                  * registers.
 433                                  */
 434                                 lwp->lwp_argsaved = 0;
 435                                 lwp->lwp_ap = (long *)&rp->r_o0;
 436                         }
 437                         mutex_exit(&p->p_lock);
 438                 }
 439                 repost = 1;
 440         }
 441 
 442         if (lwp->lwp_sysabort) {
 443                 /*
 444                  * lwp_sysabort may have been set via /proc while the process
 445                  * was stopped on PR_SYSENTRY.  If so, abort the system call.
 446                  * Override any error from the copyin() of the arguments.
 447                  */
 448                 lwp->lwp_sysabort = 0;
 449                 (void) set_errno(EINTR); /* sets post-sys processing */
 450                 t->t_pre_sys = 1;    /* repost anyway */
 451                 return (1);             /* don't do system call, return EINTR */
 452         }
 453 
 454         /* begin auditing for this syscall */
 455         if (audit_active == C2AUDIT_LOADED) {
 456                 uint32_t auditing = au_zone_getstate(NULL);
 457 
 458                 if (auditing & AU_AUDIT_MASK) {
 459                         int error;
 460                         if (error = audit_start(T_SYSCALL, code, auditing, \
 461                             0, lwp)) {
 462                                 t->t_pre_sys = 1;    /* repost anyway */
 463                                 lwp->lwp_error = 0;  /* for old drivers */
 464                                 return (error);
 465                         }
 466                         repost = 1;
 467                 }
 468         }
 469 
 470 #ifndef NPROBE
 471         /* Kernel probe */
 472         if (tnf_tracing_active) {
 473                 TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
 474                         tnf_sysnum,     sysnum,         t->t_sysnum);
 475                 t->t_post_sys = 1;   /* make sure post_syscall runs */
 476                 repost = 1;
 477         }
 478 #endif /* NPROBE */
 479 
 480 #ifdef SYSCALLTRACE
 481         if (syscalltrace) {
 482                 int i;
 483                 long *ap;
 484                 char *cp;
 485                 char *sysname;
 486                 struct sysent *callp;
 487 
 488                 if (code >= NSYSCALL)
 489                         callp = &nosys_ent; /* nosys has no args */
 490                 else
 491                         callp = LWP_GETSYSENT(lwp) + code;
 492                 (void) save_syscall_args();
 493                 mutex_enter(&systrace_lock);
 494                 printf("%d: ", p->p_pid);
 495                 if (code >= NSYSCALL)
 496                         printf("0x%x", code);
 497                 else {
 498                         sysname = mod_getsysname(code);
 499                         printf("%s[0x%x]", sysname == NULL ? "NULL" :
 500                             sysname, code);
 501                 }
 502                 cp = "(";
 503                 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
 504                         printf("%s%lx", cp, *ap);
 505                         cp = ", ";
 506                 }
 507                 if (i)
 508                         printf(")");
 509                 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
 510                 mutex_exit(&systrace_lock);
 511         }
 512 #endif /* SYSCALLTRACE */
 513 
 514         /*
 515          * If there was a continuing reason for pre-syscall processing,
 516          * set the t_pre_sys flag for the next system call.
 517          */
 518         if (repost)
 519                 t->t_pre_sys = 1;
 520         lwp->lwp_error = 0;  /* for old drivers */
 521         lwp->lwp_badpriv = PRIV_NONE;        /* for privilege tracing */
 522         return (0);
 523 }
 524 
 525 /*
 526  * Post-syscall processing.  Perform abnormal system call completion
 527  * actions such as /proc tracing, profiling, signals, preemption, etc.
 528  *
 529  * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 531  * If the condition is persistent, this routine will repost t_post_sys.
 532  */
 533 void
 534 post_syscall(long rval1, long rval2)
 535 {
 536         kthread_t       *t = curthread;
 537         proc_t  *p = curproc;
 538         klwp_t  *lwp = ttolwp(t);
 539         struct regs *rp = lwptoregs(lwp);
 540         uint_t  error;
 541         int     code = t->t_sysnum;
 542         int     repost = 0;
 543         int     proc_stop = 0;          /* non-zero if stopping for /proc */
 544         int     sigprof = 0;            /* non-zero if sending SIGPROF */
 545 
 546         t->t_post_sys = 0;
 547 
 548         error = lwp->lwp_errno;
 549 
 550         /*
 551          * Code can be zero if this is a new LWP returning after a forkall(),
 552          * other than the one which matches the one in the parent which called
 553          * forkall().  In these LWPs, skip most of post-syscall activity.
 554          */
 555         if (code == 0)
 556                 goto sig_check;
 557 
 558         /* put out audit record for this syscall */
 559         if (AU_AUDITING()) {
 560                 rval_t  rval;   /* fix audit_finish() someday */
 561 
 562                 /* XX64 -- truncation of 64-bit return values? */
 563                 rval.r_val1 = (int)rval1;
 564                 rval.r_val2 = (int)rval2;
 565                 audit_finish(T_SYSCALL, code, error, &rval);
 566                 repost = 1;
 567         }
 568 
 569         if (curthread->t_pdmsg != NULL) {
 570                 char *m = curthread->t_pdmsg;
 571 
 572                 uprintf("%s", m);
 573                 kmem_free(m, strlen(m) + 1);
 574                 curthread->t_pdmsg = NULL;
 575         }
 576 
 577         /*
 578          * If we're going to stop for /proc tracing, set the flag and
 579          * save the arguments so that the return values don't smash them.
 580          */
 581         if (PTOU(p)->u_systrap) {
 582                 if (prismember(&PTOU(p)->u_exitmask, code)) {
 583                         proc_stop = 1;
 584                         (void) save_syscall_args();
 585                 }
 586                 repost = 1;
 587         }
 588 
 589         /*
 590          * Similarly check to see if SIGPROF might be sent.
 591          */
 592         if (curthread->t_rprof != NULL &&
 593             curthread->t_rprof->rp_anystate != 0) {
 594                 (void) save_syscall_args();
 595                 sigprof = 1;
 596         }
 597 
 598         if (lwp->lwp_eosys == NORMALRETURN) {
 599                 if (error == 0) {
 600 #ifdef SYSCALLTRACE
 601                         if (syscalltrace) {
 602                                 mutex_enter(&systrace_lock);
 603                                 printf(
 604                                     "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
 605                                     p->p_pid, rval1, rval2, curthread);
 606                                 mutex_exit(&systrace_lock);
 607                         }
 608 #endif /* SYSCALLTRACE */
 609                         rp->r_tstate &= ~TSTATE_IC;
 610                         rp->r_o0 = rval1;
 611                         rp->r_o1 = rval2;
 612                 } else {
 613                         int sig;
 614 
 615 #ifdef SYSCALLTRACE
 616                         if (syscalltrace) {
 617                                 mutex_enter(&systrace_lock);
 618                                 printf("%d: error=%d, id 0x%p\n",
 619                                     p->p_pid, error, curthread);
 620                                 mutex_exit(&systrace_lock);
 621                         }
 622 #endif /* SYSCALLTRACE */
 623                         if (error == EINTR && t->t_activefd.a_stale)
 624                                 error = EBADF;
 625                         if (error == EINTR &&
 626                             (sig = lwp->lwp_cursig) != 0 &&
 627                             sigismember(&PTOU(p)->u_sigrestart, sig) &&
 628                             PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
 629                             PTOU(p)->u_signal[sig - 1] != SIG_IGN)
 630                                 error = ERESTART;
 631                         rp->r_o0 = error;
 632                         rp->r_tstate |= TSTATE_IC;
 633                 }
 634                 /*
 635                  * The default action is to redo the trap instruction.
 636                  * We increment the pc and npc past it for NORMALRETURN.
 637                  * JUSTRETURN has set up a new pc and npc already.
 638                  * If we are a cloned thread of forkall(), don't
 639                  * adjust here because we have already inherited
 640                  * the adjusted values from our clone.
 641                  */
 642                 if (!(t->t_flag & T_FORKALL)) {
 643                         rp->r_pc = rp->r_npc;
 644                         rp->r_npc += 4;
 645                 }
 646         }
 647 
 648         /*
 649          * From the proc(4) manual page:
 650          * When exit from a system call is being traced, the traced process
 651          * stops on completion of the system call just prior to checking for
 652          * signals and returning to user level.  At this point all return
 653          * values have been stored into the traced process's saved registers.
 654          */
 655         if (proc_stop) {
 656                 mutex_enter(&p->p_lock);
 657                 if (PTOU(p)->u_systrap &&
 658                     prismember(&PTOU(p)->u_exitmask, code))
 659                         stop(PR_SYSEXIT, code);
 660                 mutex_exit(&p->p_lock);
 661         }
 662 
 663         /*
 664          * If we are the parent returning from a successful
 665          * vfork, wait for the child to exec or exit.
 666          * This code must be here and not in the bowels of the system
 667          * so that /proc can intercept exit from vfork in a timely way.
 668          */
 669         if (t->t_flag & T_VFPARENT) {
 670                 ASSERT(code == SYS_vfork || code == SYS_forksys);
 671                 ASSERT(rp->r_o1 == 0 && error == 0);
 672                 vfwait((pid_t)rval1);
 673                 t->t_flag &= ~T_VFPARENT;
 674         }
 675 
 676         /*
 677          * If profiling is active, bill the current PC in user-land
 678          * and keep reposting until profiling is disabled.
 679          */
 680         if (p->p_prof.pr_scale) {
 681                 if (lwp->lwp_oweupc)
 682                         profil_tick(rp->r_pc);
 683                 repost = 1;
 684         }
 685 
 686 sig_check:
 687         /*
 688          * Reset flag for next time.
 689          * We must do this after stopping on PR_SYSEXIT
 690          * because /proc uses the information in lwp_eosys.
 691          */
 692         lwp->lwp_eosys = NORMALRETURN;
 693         clear_stale_fd();
 694         t->t_flag &= ~T_FORKALL;
 695 
 696         if (t->t_astflag | t->t_sig_check) {
 697                 /*
 698                  * Turn off the AST flag before checking all the conditions that
 699                  * may have caused an AST.  This flag is on whenever a signal or
 700                  * unusual condition should be handled after the next trap or
 701                  * syscall.
 702                  */
 703                 astoff(t);
 704                 t->t_sig_check = 0;
 705 
 706                 /*
 707                  * The following check is legal for the following reasons:
 708                  *      1) The thread we are checking, is ourselves, so there is
 709                  *         no way the proc can go away.
 710                  *      2) The only time we need to be protected by the
 711                  *         lock is if the binding is changed.
 712                  *
 713                  *      Note we will still take the lock and check the binding
 714                  *      if the condition was true without the lock held.  This
 715                  *      prevents lock contention among threads owned by the
 716                  *      same proc.
 717                  */
 718 
 719                 if (curthread->t_proc_flag & TP_CHANGEBIND) {
 720                         mutex_enter(&p->p_lock);
 721                         if (curthread->t_proc_flag & TP_CHANGEBIND) {
 722                                 timer_lwpbind();
 723                                 curthread->t_proc_flag &= ~TP_CHANGEBIND;
 724                         }
 725                         mutex_exit(&p->p_lock);
 726                 }
 727 
 728                 /*
 729                  * for kaio requests on the special kaio poll queue,
 730                  * copyout their results to user memory.
 731                  */
 732                 if (p->p_aio)
 733                         aio_cleanup(0);
 734 
 735                 /*
 736                  * If this LWP was asked to hold, call holdlwp(), which will
 737                  * stop.  holdlwps() sets this up and calls pokelwps() which
 738                  * sets the AST flag.
 739                  *
 740                  * Also check TP_EXITLWP, since this is used by fresh new LWPs
 741                  * through lwp_rtt().  That flag is set if the lwp_create(2)
 742                  * syscall failed after creating the LWP.
 743                  */
 744                 if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
 745                         holdlwp();
 746 
 747                 /*
 748                  * All code that sets signals and makes ISSIG_PENDING
 749                  * evaluate true must set t_sig_check afterwards.
 750                  */
 751                 if (ISSIG_PENDING(t, lwp, p)) {
 752                         if (issig(FORREAL))
 753                                 psig();
 754                         t->t_sig_check = 1;  /* recheck next time */
 755                 }
 756 
 757                 if (sigprof) {
 758                         int nargs = (code > 0 && code < NSYSCALL)?
 759                             LWP_GETSYSENT(lwp)[code].sy_narg : 0;
 760                         realsigprof(code, nargs, error);
 761                         t->t_sig_check = 1;  /* recheck next time */
 762                 }
 763 
 764                 /*
 765                  * If a performance counter overflow interrupt was
 766                  * delivered *during* the syscall, then re-enable the
 767                  * AST so that we take a trip through trap() to cause
 768                  * the SIGEMT to be delivered.
 769                  */
 770                 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
 771                         aston(t);
 772 
 773                 /*
 774                  * If an asynchronous hardware error is pending, turn AST flag
 775                  * back on.  AST will be checked again before we return to user
 776                  * mode and we'll come back through trap() to handle the error.
 777                  */
 778                 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
 779                         aston(t);
 780         }
 781 
 782         /*
 783          * Restore register window if a debugger modified it.
 784          * Set up to perform a single-step if a debugger requested it.
 785          */
 786         if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
 787                 xregrestore(lwp, 1);
 788 
 789         lwp->lwp_errno = 0;          /* clear error for next time */
 790 
 791 #ifndef NPROBE
 792         /* Kernel probe */
 793         if (tnf_tracing_active) {
 794                 TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
 795                     tnf_long,   rval1,          rval1,
 796                     tnf_long,   rval2,          rval2,
 797                     tnf_long,   errno,          (long)error);
 798                 repost = 1;
 799         }
 800 #endif /* NPROBE */
 801 
 802         /*
 803          * Set state to LWP_USER here so preempt won't give us a kernel
 804          * priority if it occurs after this point.  Call CL_TRAPRET() to
 805          * restore the user-level priority.
 806          *
 807          * It is important that no locks (other than spinlocks) be entered
 808          * after this point before returning to user mode (unless lwp_state
 809          * is set back to LWP_SYS).
 810          *
 811          * Sampled times past this point are charged to the user.
 812          */
 813         lwp->lwp_state = LWP_USER;
 814 
 815         if (t->t_trapret) {
 816                 t->t_trapret = 0;
 817                 thread_lock(t);
 818                 CL_TRAPRET(t);
 819                 thread_unlock(t);
 820         }
 821         if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
 822                 preempt();
 823         prunstop();
 824 
 825         /*
 826          * t_post_sys will be set if pcb_step is active.
 827          */
 828         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 829                 prdostep();
 830                 repost = 1;
 831         }
 832 
 833         t->t_sysnum = 0;     /* no longer in a system call */
 834 
 835         /*
 836          * In case the args were copied to the lwp, reset the
 837          * pointer so the next syscall will have the right lwp_ap pointer.
 838          */
 839         lwp->lwp_ap = (long *)&rp->r_o0;
 840         lwp->lwp_argsaved = 0;
 841 
 842         /*
 843          * If there was a continuing reason for post-syscall processing,
 844          * set the t_post_sys flag for the next system call.
 845          */
 846         if (repost)
 847                 t->t_post_sys = 1;
 848 
 849         /*
 850          * If there is a ustack registered for this lwp, and the stack rlimit
 851          * has been altered, read in the ustack. If the saved stack rlimit
 852          * matches the bounds of the ustack, update the ustack to reflect
 853          * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
 854          * stack checking by setting the size to 0.
 855          */
 856         if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
 857                 rlim64_t new_size;
 858                 model_t model;
 859                 caddr_t top;
 860                 struct rlimit64 rl;
 861 
 862                 mutex_enter(&p->p_lock);
 863                 new_size = p->p_stk_ctl;
 864                 model = p->p_model;
 865                 top = p->p_usrstack;
 866                 (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
 867                 mutex_exit(&p->p_lock);
 868 
 869                 if (rl.rlim_cur == RLIM64_INFINITY)
 870                         new_size = 0;
 871 
 872                 if (model == DATAMODEL_NATIVE) {
 873                         stack_t stk;
 874 
 875                         if (copyin((stack_t *)lwp->lwp_ustack, &stk,
 876                             sizeof (stack_t)) == 0 &&
 877                             (stk.ss_size == lwp->lwp_old_stk_ctl ||
 878                             stk.ss_size == 0) &&
 879                             stk.ss_sp == top - stk.ss_size) {
 880                                 stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
 881                                     stk.ss_size - new_size);
 882                                 stk.ss_size = new_size;
 883 
 884                                 (void) copyout(&stk,
 885                                     (stack_t *)lwp->lwp_ustack,
 886                                     sizeof (stack_t));
 887                         }
 888                 } else {
 889                         stack32_t stk32;
 890 
 891                         if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
 892                             sizeof (stack32_t)) == 0 &&
 893                             (stk32.ss_size == lwp->lwp_old_stk_ctl ||
 894                             stk32.ss_size == 0) &&
 895                             stk32.ss_sp ==
 896                             (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
 897                                 stk32.ss_sp += stk32.ss_size - new_size;
 898                                 stk32.ss_size = new_size;
 899 
 900                                 (void) copyout(&stk32,
 901                                     (stack32_t *)lwp->lwp_ustack,
 902                                     sizeof (stack32_t));
 903                         }
 904                 }
 905 
 906                 lwp->lwp_old_stk_ctl = 0;
 907         }
 908 
 909         syscall_mstate(LMS_SYSTEM, LMS_USER);
 910 }
 911 
 912 /*
 913  * Call a system call which takes a pointer to the user args struct and
 914  * a pointer to the return values.  This is a bit slower than the standard
 915  * C arg-passing method in some cases.
 916  */
 917 int64_t
 918 syscall_ap()
 919 {
 920         uint_t  error;
 921         struct sysent *callp;
 922         rval_t  rval;
 923         klwp_t  *lwp = ttolwp(curthread);
 924         struct regs *rp = lwptoregs(lwp);
 925 
 926         callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
 927 
 928         /*
 929          * If the arguments don't fit in registers %o0 - o5, make sure they
 930          * have been copied to the lwp_arg array.
 931          */
 932         if (callp->sy_narg > 6 && save_syscall_args())
 933                 return ((int64_t)set_errno(EFAULT));
 934 
 935         rval.r_val1 = 0;
 936         rval.r_val2 = (int)rp->r_o1;
 937         lwp->lwp_error = 0;  /* for old drivers */
 938         error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
 939         if (error)
 940                 return ((int64_t)set_errno(error));
 941         return (rval.r_vals);
 942 }
 943 
 944 /*
 945  * Load system call module.
 946  *      Returns with pointer to held read lock for module.
 947  */
 948 static krwlock_t *
 949 lock_syscall(struct sysent *table, uint_t code)
 950 {
 951         krwlock_t       *module_lock;
 952         struct modctl   *modp;
 953         int             id;
 954         struct sysent   *callp;
 955 
 956         module_lock = table[code].sy_lock;
 957         callp = &table[code];
 958 
 959         /*
 960          * Optimization to only call modload if we don't have a loaded
 961          * syscall.
 962          */
 963         rw_enter(module_lock, RW_READER);
 964         if (LOADED_SYSCALL(callp))
 965                 return (module_lock);
 966         rw_exit(module_lock);
 967 
 968         for (;;) {
 969                 if ((id = modload("sys", syscallnames[code])) == -1)
 970                         break;
 971 
 972                 /*
 973                  * If we loaded successfully at least once, the modctl
 974                  * will still be valid, so we try to grab it by filename.
 975                  * If this call fails, it's because the mod_filename
 976                  * was changed after the call to modload() (mod_hold_by_name()
 977                  * is the likely culprit).  We can safely just take
 978                  * another lap if this is the case;  the modload() will
 979                  * change the mod_filename back to one by which we can
 980                  * find the modctl.
 981                  */
 982                 modp = mod_find_by_filename("sys", syscallnames[code]);
 983 
 984                 if (modp == NULL)
 985                         continue;
 986 
 987                 mutex_enter(&mod_lock);
 988 
 989                 if (!modp->mod_installed) {
 990                         mutex_exit(&mod_lock);
 991                         continue;
 992                 }
 993                 break;
 994         }
 995 
 996         rw_enter(module_lock, RW_READER);
 997 
 998         if (id != -1)
 999                 mutex_exit(&mod_lock);
1000 
1001         return (module_lock);
1002 }
1003 
1004 /*
1005  * Loadable syscall support.
1006  *      If needed, load the module, then reserve it by holding a read
1007  *      lock for the duration of the call.
1008  *      Later, if the syscall is not unloadable, it could patch the vector.
1009  */
1010 /*ARGSUSED*/
1011 int64_t
1012 loadable_syscall(
1013     long a0, long a1, long a2, long a3,
1014     long a4, long a5, long a6, long a7)
1015 {
1016         int64_t         rval;
1017         struct sysent   *callp;
1018         struct sysent   *se = LWP_GETSYSENT(ttolwp(curthread));
1019         krwlock_t       *module_lock;
1020         int             code;
1021 
1022         code = curthread->t_sysnum;
1023         callp = se + code;
1024 
1025         /*
1026          * Try to autoload the system call if necessary.
1027          */
1028         module_lock = lock_syscall(se, code);
1029         THREAD_KPRI_RELEASE();  /* drop priority given by rw_enter */
1030 
1031         /*
1032          * we've locked either the loaded syscall or nosys
1033          */
1034         if (callp->sy_flags & SE_ARGC) {
1035                 int64_t (*sy_call)();
1036 
1037                 sy_call = (int64_t (*)())callp->sy_call;
1038                 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1039         } else {
1040                 rval = syscall_ap();
1041         }
1042 
1043         THREAD_KPRI_REQUEST();  /* regain priority from read lock */
1044         rw_exit(module_lock);
1045         return (rval);
1046 }
1047 
1048 /*
1049  * Handle indirect system calls.
1050  *      This interface should be deprecated.  The library can handle
1051  *      this more efficiently, but keep this implementation for old binaries.
1052  *
1053  * XX64 Needs some work.
1054  */
1055 int64_t
1056 indir(int code, long a0, long a1, long a2, long a3, long a4)
1057 {
1058         klwp_t          *lwp = ttolwp(curthread);
1059         struct sysent   *callp;
1060 
1061         if (code <= 0 || code >= NSYSCALL)
1062                 return (nosys());
1063 
1064         ASSERT(lwp->lwp_ap != NULL);
1065 
1066         curthread->t_sysnum = code;
1067         callp = LWP_GETSYSENT(lwp) + code;
1068 
1069         /*
1070          * Handle argument setup, unless already done in pre_syscall().
1071          */
1072         if (callp->sy_narg > 5) {
1073                 if (save_syscall_args())        /* move args to LWP array */
1074                         return ((int64_t)set_errno(EFAULT));
1075         } else if (!lwp->lwp_argsaved) {
1076                 long *ap;
1077 
1078                 ap = lwp->lwp_ap;            /* args haven't been saved */
1079                 lwp->lwp_ap = ap + 1;                /* advance arg pointer */
1080                 curthread->t_post_sys = 1;   /* so lwp_ap will be reset */
1081         }
1082         return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1083 }
1084 
1085 /*
1086  * set_errno - set an error return from the current system call.
1087  *      This could be a macro.
1088  *      This returns the value it is passed, so that the caller can
1089  *      use tail-recursion-elimination and do return (set_errno(ERRNO));
1090  */
1091 uint_t
1092 set_errno(uint_t error)
1093 {
1094         ASSERT(error != 0);             /* must not be used to clear errno */
1095 
1096         curthread->t_post_sys = 1;   /* have post_syscall do error return */
1097         return (ttolwp(curthread)->lwp_errno = error);
1098 }
1099 
1100 /*
1101  * set_proc_pre_sys - Set pre-syscall processing for entire process.
1102  */
1103 void
1104 set_proc_pre_sys(proc_t *p)
1105 {
1106         kthread_t       *t;
1107         kthread_t       *first;
1108 
1109         ASSERT(MUTEX_HELD(&p->p_lock));
1110 
1111         t = first = p->p_tlist;
1112         do {
1113                 t->t_pre_sys = 1;
1114         } while ((t = t->t_forw) != first);
1115 }
1116 
1117 /*
1118  * set_proc_post_sys - Set post-syscall processing for entire process.
1119  */
1120 void
1121 set_proc_post_sys(proc_t *p)
1122 {
1123         kthread_t       *t;
1124         kthread_t       *first;
1125 
1126         ASSERT(MUTEX_HELD(&p->p_lock));
1127 
1128         t = first = p->p_tlist;
1129         do {
1130                 t->t_post_sys = 1;
1131         } while ((t = t->t_forw) != first);
1132 }
1133 
1134 /*
1135  * set_proc_sys - Set pre- and post-syscall processing for entire process.
1136  */
1137 void
1138 set_proc_sys(proc_t *p)
1139 {
1140         kthread_t       *t;
1141         kthread_t       *first;
1142 
1143         ASSERT(MUTEX_HELD(&p->p_lock));
1144 
1145         t = first = p->p_tlist;
1146         do {
1147                 t->t_pre_sys = 1;
1148                 t->t_post_sys = 1;
1149         } while ((t = t->t_forw) != first);
1150 }
1151 
1152 /*
1153  * set_all_proc_sys - set pre- and post-syscall processing flags for all
1154  * user processes.
1155  *
1156  * This is needed when auditing, tracing, or other facilities which affect
1157  * all processes are turned on.
1158  */
1159 void
1160 set_all_proc_sys()
1161 {
1162         kthread_t       *t;
1163         kthread_t       *first;
1164 
1165         mutex_enter(&pidlock);
1166         t = first = curthread;
1167         do {
1168                 t->t_pre_sys = 1;
1169                 t->t_post_sys = 1;
1170         } while ((t = t->t_next) != first);
1171         mutex_exit(&pidlock);
1172 }
1173 
1174 /*
1175  * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1176  * all user processes running in the zone of the current process
1177  *
1178  * This is needed when auditing is turned on.
1179  */
1180 void
1181 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1182 {
1183         proc_t      *p;
1184         kthread_t   *t;
1185 
1186         mutex_enter(&pidlock);
1187         for (p = practive; p != NULL; p = p->p_next) {
1188                 /* skip kernel processes */
1189                 if (p->p_exec == NULLVP || p->p_as == &kas ||
1190                     p->p_stat == SIDL || p->p_stat == SZOMB ||
1191                     (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1192                         continue;
1193                 /*
1194                  * Only processes in the given zone (eventually in
1195                  * all zones) are taken into account
1196                  */
1197                 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1198                         mutex_enter(&p->p_lock);
1199                         if ((t = p->p_tlist) == NULL) {
1200                                 mutex_exit(&p->p_lock);
1201                                 continue;
1202                         }
1203                         /*
1204                          * Set pre- and post-syscall processing flags
1205                          * for all threads of the process
1206                          */
1207                         do {
1208                                 t->t_pre_sys = 1;
1209                                 t->t_post_sys = 1;
1210                         } while (p->p_tlist != (t = t->t_forw));
1211                         mutex_exit(&p->p_lock);
1212                 }
1213         }
1214         mutex_exit(&pidlock);
1215 }
1216 
1217 /*
1218  * set_proc_ast - Set asynchronous service trap (AST) flag for all
1219  * threads in process.
1220  */
1221 void
1222 set_proc_ast(proc_t *p)
1223 {
1224         kthread_t       *t;
1225         kthread_t       *first;
1226 
1227         ASSERT(MUTEX_HELD(&p->p_lock));
1228 
1229         t = first = p->p_tlist;
1230         do {
1231                 aston(t);
1232         } while ((t = t->t_forw) != first);
1233 }