1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  24  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  25  */
  26 
  27 #include <errno.h>
  28 #include <fcntl.h>
  29 #include <dirent.h>
  30 #include <stddef.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <strings.h>
  34 #include <unistd.h>
  35 #include <thread.h>
  36 #include <sys/auxv.h>
  37 #include <sys/brand.h>
  38 #include <sys/inttypes.h>
  39 #include <sys/lwp.h>
  40 #include <sys/syscall.h>
  41 #include <sys/systm.h>
  42 #include <sys/utsname.h>
  43 #include <sys/sysconfig.h>
  44 #include <sys/systeminfo.h>
  45 #include <sys/zone.h>
  46 #include <sys/stat.h>
  47 #include <sys/mntent.h>
  48 #include <sys/ctfs.h>
  49 #include <sys/priv.h>
  50 #include <sys/acctctl.h>
  51 #include <libgen.h>
  52 #include <bsm/audit.h>
  53 #include <sys/crypto/ioctl.h>
  54 #include <sys/fs/zfs.h>
  55 #include <sys/zfs_ioctl.h>
  56 #include <sys/ucontext.h>
  57 #include <sys/mntio.h>
  58 #include <sys/mnttab.h>
  59 #include <sys/attr.h>
  60 #include <sys/lofi.h>
  61 #include <sys/mkdev.h>
  62 #include <atomic.h>
  63 #include <sys/acl.h>
  64 #include <sys/socket.h>
  65 
  66 #include <s10_brand.h>
  67 #include <brand_misc.h>
  68 #include <s10_misc.h>
  69 #include <s10_signal.h>
  70 
  71 /*
  72  * See usr/src/lib/brand/shared/brand/common/brand_util.c for general
  73  * emulation notes.
  74  */
  75 
  76 static zoneid_t zoneid;
  77 static boolean_t emul_global_zone = B_FALSE;
  78 static s10_emul_bitmap_t emul_bitmap;
  79 pid_t zone_init_pid;
  80 
  81 /*
  82  * S10_FEATURE_IS_PRESENT is a macro that helps facilitate conditional
  83  * emulation.  For each constant N defined in the s10_emulated_features
  84  * enumeration in usr/src/uts/common/brand/solaris10/s10_brand.h,
  85  * S10_FEATURE_IS_PRESENT(N) is true iff the feature/backport represented by N
  86  * is present in the Solaris 10 image hosted within the zone.  In other words,
  87  * S10_FEATURE_IS_PRESENT(N) is true iff the file /usr/lib/brand/solaris10/M,
  88  * where M is the enum value of N, was present in the zone when the zone booted.
  89  *
  90  *
  91  * *** Sample Usage
  92  *
  93  * Suppose that you need to backport a fix to Solaris 10 and there is
  94  * emulation in place for the fix.  Suppose further that the emulation won't be
  95  * needed if the fix is backported (i.e., if the fix is present in the hosted
  96  * Solaris 10 environment, then the brand won't need the emulation).  Then if
  97  * you add a constant named "S10_FEATURE_X" to the end of the
  98  * s10_emulated_features enumeration that represents the backported fix and
  99  * S10_FEATURE_X evaluates to four, then you should create a file named
 100  * /usr/lib/brand/solaris10/4 as part of your backport.  Additionally, you
 101  * should retain the aforementioned emulation but modify it so that it's
 102  * performed only when S10_FEATURE_IS_PRESENT(S10_FEATURE_X) is false.  Thus the
 103  * emulation function should look something like the following:
 104  *
 105  *      static int
 106  *      my_emul_function(sysret_t *rv, ...)
 107  *      {
 108  *              if (S10_FEATURE_IS_PRESENT(S10_FEATURE_X)) {
 109  *                      // Don't emulate
 110  *                      return (__systemcall(rv, ...));
 111  *              } else {
 112  *                      // Emulate whatever needs to be emulated when the
 113  *                      // backport isn't present in the Solaris 10 image.
 114  *              }
 115  *      }
 116  */
 117 #define S10_FEATURE_IS_PRESENT(s10_emulated_features_constant)  \
 118         ((emul_bitmap[(s10_emulated_features_constant) >> 3] &        \
 119         (1 << ((s10_emulated_features_constant) & 0x7))) != 0)
 120 
 121 brand_sysent_table_t brand_sysent_table[];
 122 
 123 #define S10_UTS_RELEASE "5.10"
 124 #define S10_UTS_VERSION "Generic_Virtual"
 125 
 126 /*
 127  * If the ioctl fd's major doesn't match "major", then pass through the
 128  * ioctl, since it is not the expected device.  major should be a
 129  * pointer to a static dev_t initialized to -1, and devname should be
 130  * the path of the device.
 131  *
 132  * Returns 1 if the ioctl was handled (in which case *err contains the
 133  * error code), or 0 if it still needs handling.
 134  */
 135 static int
 136 passthru_otherdev_ioctl(dev_t *majordev, const char *devname, int *err,
 137     sysret_t *rval, int fdes, int cmd, intptr_t arg)
 138 {
 139         struct stat sbuf;
 140 
 141         if (*majordev == (dev_t)-1) {
 142                 if ((*err = __systemcall(rval, SYS_fstatat + 1024,
 143                     AT_FDCWD, devname, &sbuf, 0) != 0) != 0)
 144                         goto doioctl;
 145 
 146                 *majordev = major(sbuf.st_rdev);
 147         }
 148 
 149         if ((*err = __systemcall(rval, SYS_fstatat + 1024, fdes,
 150             NULL, &sbuf, 0)) != 0)
 151                 goto doioctl;
 152 
 153         if (major(sbuf.st_rdev) == *majordev)
 154                 return (0);
 155 
 156 doioctl:
 157         *err = (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
 158         return (1);
 159 }
 160 
 161 /*
 162  * Figures out the PID of init for the zone.  Also returns a boolean
 163  * indicating whether this process currently has that pid: if so,
 164  * then at this moment, we are init.
 165  */
 166 static boolean_t
 167 get_initpid_info(void)
 168 {
 169         pid_t pid;
 170         sysret_t rval;
 171         int err;
 172 
 173         /*
 174          * Determine the current process PID and the PID of the zone's init.
 175          * We use care not to call getpid() here, because we're not supposed
 176          * to call getpid() until after the program is fully linked-- the
 177          * first call to getpid() is a signal from the linker to debuggers
 178          * that linking has been completed.
 179          */
 180         if ((err = __systemcall(&rval, SYS_brand,
 181             B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) {
 182                 brand_abort(err, "Failed to get init's pid");
 183         }
 184 
 185         /*
 186          * Note that we need to be cautious with the pid we get back--
 187          * it should not be stashed and used in place of getpid(), since
 188          * we might fork(2).  So we keep zone_init_pid and toss the pid
 189          * we otherwise got.
 190          */
 191         if (pid == zone_init_pid)
 192                 return (B_TRUE);
 193 
 194         return (B_FALSE);
 195 }
 196 
 197 /* Free the thread-local storage provided by mntfs_get_mntentbuf(). */
 198 static void
 199 mntfs_free_mntentbuf(void *arg)
 200 {
 201         struct mntentbuf *embufp = arg;
 202 
 203         if (embufp == NULL)
 204                 return;
 205         if (embufp->mbuf_emp)
 206                 free(embufp->mbuf_emp);
 207         if (embufp->mbuf_buf)
 208                 free(embufp->mbuf_buf);
 209         bzero(embufp, sizeof (struct mntentbuf));
 210         free(embufp);
 211 }
 212 
 213 /* Provide the thread-local storage required by mntfs_ioctl(). */
 214 static struct mntentbuf *
 215 mntfs_get_mntentbuf(size_t size)
 216 {
 217         static mutex_t keylock;
 218         static thread_key_t key;
 219         static int once_per_keyname = 0;
 220         void *tsd = NULL;
 221         struct mntentbuf *embufp;
 222 
 223         /* Create the key. */
 224         if (!once_per_keyname) {
 225                 (void) mutex_lock(&keylock);
 226                 if (!once_per_keyname) {
 227                         if (thr_keycreate(&key, mntfs_free_mntentbuf)) {
 228                                 (void) mutex_unlock(&keylock);
 229                                 return (NULL);
 230                         } else {
 231                                 once_per_keyname++;
 232                         }
 233                 }
 234                 (void) mutex_unlock(&keylock);
 235         }
 236 
 237         /*
 238          * The thread-specific datum for this key is the address of a struct
 239          * mntentbuf. If this is the first time here then we allocate the struct
 240          * and its contents, and associate its address with the thread; if there
 241          * are any problems then we abort.
 242          */
 243         if (thr_getspecific(key, &tsd))
 244                 return (NULL);
 245         if (tsd == NULL) {
 246                 if (!(embufp = calloc(1, sizeof (struct mntentbuf))) ||
 247                     !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) ||
 248                     thr_setspecific(key, embufp)) {
 249                         mntfs_free_mntentbuf(embufp);
 250                         return (NULL);
 251                 }
 252         } else {
 253                 embufp = tsd;
 254         }
 255 
 256         /* Return the buffer, resizing it if necessary. */
 257         if (size > embufp->mbuf_bufsize) {
 258                 if (embufp->mbuf_buf)
 259                         free(embufp->mbuf_buf);
 260                 if ((embufp->mbuf_buf = malloc(size)) == NULL) {
 261                         embufp->mbuf_bufsize = 0;
 262                         return (NULL);
 263                 } else {
 264                         embufp->mbuf_bufsize = size;
 265                 }
 266         }
 267         return (embufp);
 268 }
 269 
 270 /*
 271  * The MNTIOC_GETMNTENT command in this release differs from that in early
 272  * versions of Solaris 10.
 273  *
 274  * Previously, the command would copy a pointer to a struct extmnttab to an
 275  * address provided as an argument. The pointer would be somewhere within a
 276  * mapping already present within the user's address space. In addition, the
 277  * text to which the struct's members pointed would also be within a
 278  * pre-existing mapping. Now, the user is required to allocate memory for both
 279  * the struct and the text buffer, and to pass the address of each within a
 280  * struct mntentbuf. In order to conceal these details from a Solaris 10 client
 281  * we allocate some thread-local storage in which to create the necessary data
 282  * structures; this is static, thread-safe memory that will be cleaned up
 283  * without the caller's intervention.
 284  *
 285  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should
 286  * not work for older clients.
 287  */
 288 int
 289 mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 290 {
 291         int err;
 292         struct stat statbuf;
 293         struct mntentbuf *embufp;
 294         static size_t bufsize = MNT_LINE_MAX;
 295 
 296         /* Do not emulate mntfs commands from up-to-date clients. */
 297         if (S10_FEATURE_IS_PRESENT(S10_FEATURE_ALTERED_MNTFS_IOCTL))
 298                 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
 299 
 300         /* Do not emulate mntfs commands directed at other file systems. */
 301         if ((err = __systemcall(rval, SYS_fstatat + 1024,
 302             fdes, NULL, &statbuf, 0)) != 0)
 303                 return (err);
 304         if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0)
 305                 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
 306 
 307         if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY)
 308                 return (EINVAL);
 309 
 310         if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
 311                 return (ENOMEM);
 312 
 313         /*
 314          * MNTIOC_GETEXTMNTENT advances the file pointer once it has
 315          * successfully copied out the result to the address provided. We
 316          * therefore need to check the user-supplied address now since the
 317          * one we'll be providing is guaranteed to work.
 318          */
 319         if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
 320                 return (EFAULT);
 321 
 322         /*
 323          * Keep retrying for as long as we fail for want of a large enough
 324          * buffer.
 325          */
 326         for (;;) {
 327                 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
 328                     MNTIOC_GETEXTMNTENT, embufp)) != 0)
 329                         return (err);
 330 
 331                 if (rval->sys_rval1 == MNTFS_TOOLONG) {
 332                         /* The buffer wasn't large enough. */
 333                         (void) atomic_swap_ulong((unsigned long *)&bufsize,
 334                             2 * embufp->mbuf_bufsize);
 335                         if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
 336                                 return (ENOMEM);
 337                 } else {
 338                         break;
 339                 }
 340         }
 341 
 342         if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
 343                 return (EFAULT);
 344 
 345         return (0);
 346 }
 347 
 348 /*
 349  * Assign the structure member value from the s (source) structure to the
 350  * d (dest) structure.
 351  */
 352 #define struct_assign(d, s, val)        (((d).val) = ((s).val))
 353 
 354 /*
 355  * The CRYPTO_GET_FUNCTION_LIST parameter structure crypto_function_list_t
 356  * changed between S10 and Nevada, so we have to emulate the old S10
 357  * crypto_function_list_t structure when interposing on the ioctl syscall.
 358  */
 359 typedef struct s10_crypto_function_list {
 360         boolean_t fl_digest_init;
 361         boolean_t fl_digest;
 362         boolean_t fl_digest_update;
 363         boolean_t fl_digest_key;
 364         boolean_t fl_digest_final;
 365 
 366         boolean_t fl_encrypt_init;
 367         boolean_t fl_encrypt;
 368         boolean_t fl_encrypt_update;
 369         boolean_t fl_encrypt_final;
 370 
 371         boolean_t fl_decrypt_init;
 372         boolean_t fl_decrypt;
 373         boolean_t fl_decrypt_update;
 374         boolean_t fl_decrypt_final;
 375 
 376         boolean_t fl_mac_init;
 377         boolean_t fl_mac;
 378         boolean_t fl_mac_update;
 379         boolean_t fl_mac_final;
 380 
 381         boolean_t fl_sign_init;
 382         boolean_t fl_sign;
 383         boolean_t fl_sign_update;
 384         boolean_t fl_sign_final;
 385         boolean_t fl_sign_recover_init;
 386         boolean_t fl_sign_recover;
 387 
 388         boolean_t fl_verify_init;
 389         boolean_t fl_verify;
 390         boolean_t fl_verify_update;
 391         boolean_t fl_verify_final;
 392         boolean_t fl_verify_recover_init;
 393         boolean_t fl_verify_recover;
 394 
 395         boolean_t fl_digest_encrypt_update;
 396         boolean_t fl_decrypt_digest_update;
 397         boolean_t fl_sign_encrypt_update;
 398         boolean_t fl_decrypt_verify_update;
 399 
 400         boolean_t fl_seed_random;
 401         boolean_t fl_generate_random;
 402 
 403         boolean_t fl_session_open;
 404         boolean_t fl_session_close;
 405         boolean_t fl_session_login;
 406         boolean_t fl_session_logout;
 407 
 408         boolean_t fl_object_create;
 409         boolean_t fl_object_copy;
 410         boolean_t fl_object_destroy;
 411         boolean_t fl_object_get_size;
 412         boolean_t fl_object_get_attribute_value;
 413         boolean_t fl_object_set_attribute_value;
 414         boolean_t fl_object_find_init;
 415         boolean_t fl_object_find;
 416         boolean_t fl_object_find_final;
 417 
 418         boolean_t fl_key_generate;
 419         boolean_t fl_key_generate_pair;
 420         boolean_t fl_key_wrap;
 421         boolean_t fl_key_unwrap;
 422         boolean_t fl_key_derive;
 423 
 424         boolean_t fl_init_token;
 425         boolean_t fl_init_pin;
 426         boolean_t fl_set_pin;
 427 
 428         boolean_t prov_is_hash_limited;
 429         uint32_t prov_hash_threshold;
 430         uint32_t prov_hash_limit;
 431 } s10_crypto_function_list_t;
 432 
 433 typedef struct s10_crypto_get_function_list {
 434         uint_t                          fl_return_value;
 435         crypto_provider_id_t            fl_provider_id;
 436         s10_crypto_function_list_t      fl_list;
 437 } s10_crypto_get_function_list_t;
 438 
 439 /*
 440  * The structure returned by the CRYPTO_GET_FUNCTION_LIST ioctl on /dev/crypto
 441  * increased in size due to:
 442  *      6482533 Threshold for HW offload via PKCS11 interface
 443  * between S10 and Nevada.  This is a relatively simple process of filling
 444  * in the S10 structure fields with the Nevada data.
 445  *
 446  * We stat the device to make sure that the ioctl is meant for /dev/crypto.
 447  *
 448  */
 449 static int
 450 crypto_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 451 {
 452         int                             err;
 453         s10_crypto_get_function_list_t  s10_param;
 454         crypto_get_function_list_t      native_param;
 455         static dev_t                    crypto_dev = (dev_t)-1;
 456 
 457         if (passthru_otherdev_ioctl(&crypto_dev, "/dev/crypto", &err,
 458             rval, fdes, cmd, arg) == 1)
 459                 return (err);
 460 
 461         if (brand_uucopy((const void *)arg, &s10_param, sizeof (s10_param))
 462             != 0)
 463                 return (EFAULT);
 464         struct_assign(native_param, s10_param, fl_provider_id);
 465         if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd,
 466             &native_param)) != 0)
 467                 return (err);
 468 
 469         struct_assign(s10_param, native_param, fl_return_value);
 470         struct_assign(s10_param, native_param, fl_provider_id);
 471 
 472         struct_assign(s10_param, native_param, fl_list.fl_digest_init);
 473         struct_assign(s10_param, native_param, fl_list.fl_digest);
 474         struct_assign(s10_param, native_param, fl_list.fl_digest_update);
 475         struct_assign(s10_param, native_param, fl_list.fl_digest_key);
 476         struct_assign(s10_param, native_param, fl_list.fl_digest_final);
 477 
 478         struct_assign(s10_param, native_param, fl_list.fl_encrypt_init);
 479         struct_assign(s10_param, native_param, fl_list.fl_encrypt);
 480         struct_assign(s10_param, native_param, fl_list.fl_encrypt_update);
 481         struct_assign(s10_param, native_param, fl_list.fl_encrypt_final);
 482 
 483         struct_assign(s10_param, native_param, fl_list.fl_decrypt_init);
 484         struct_assign(s10_param, native_param, fl_list.fl_decrypt);
 485         struct_assign(s10_param, native_param, fl_list.fl_decrypt_update);
 486         struct_assign(s10_param, native_param, fl_list.fl_decrypt_final);
 487 
 488         struct_assign(s10_param, native_param, fl_list.fl_mac_init);
 489         struct_assign(s10_param, native_param, fl_list.fl_mac);
 490         struct_assign(s10_param, native_param, fl_list.fl_mac_update);
 491         struct_assign(s10_param, native_param, fl_list.fl_mac_final);
 492 
 493         struct_assign(s10_param, native_param, fl_list.fl_sign_init);
 494         struct_assign(s10_param, native_param, fl_list.fl_sign);
 495         struct_assign(s10_param, native_param, fl_list.fl_sign_update);
 496         struct_assign(s10_param, native_param, fl_list.fl_sign_final);
 497         struct_assign(s10_param, native_param, fl_list.fl_sign_recover_init);
 498         struct_assign(s10_param, native_param, fl_list.fl_sign_recover);
 499 
 500         struct_assign(s10_param, native_param, fl_list.fl_verify_init);
 501         struct_assign(s10_param, native_param, fl_list.fl_verify);
 502         struct_assign(s10_param, native_param, fl_list.fl_verify_update);
 503         struct_assign(s10_param, native_param, fl_list.fl_verify_final);
 504         struct_assign(s10_param, native_param, fl_list.fl_verify_recover_init);
 505         struct_assign(s10_param, native_param, fl_list.fl_verify_recover);
 506 
 507         struct_assign(s10_param, native_param,
 508             fl_list.fl_digest_encrypt_update);
 509         struct_assign(s10_param, native_param,
 510             fl_list.fl_decrypt_digest_update);
 511         struct_assign(s10_param, native_param, fl_list.fl_sign_encrypt_update);
 512         struct_assign(s10_param, native_param,
 513             fl_list.fl_decrypt_verify_update);
 514 
 515         struct_assign(s10_param, native_param, fl_list.fl_seed_random);
 516         struct_assign(s10_param, native_param, fl_list.fl_generate_random);
 517 
 518         struct_assign(s10_param, native_param, fl_list.fl_session_open);
 519         struct_assign(s10_param, native_param, fl_list.fl_session_close);
 520         struct_assign(s10_param, native_param, fl_list.fl_session_login);
 521         struct_assign(s10_param, native_param, fl_list.fl_session_logout);
 522 
 523         struct_assign(s10_param, native_param, fl_list.fl_object_create);
 524         struct_assign(s10_param, native_param, fl_list.fl_object_copy);
 525         struct_assign(s10_param, native_param, fl_list.fl_object_destroy);
 526         struct_assign(s10_param, native_param, fl_list.fl_object_get_size);
 527         struct_assign(s10_param, native_param,
 528             fl_list.fl_object_get_attribute_value);
 529         struct_assign(s10_param, native_param,
 530             fl_list.fl_object_set_attribute_value);
 531         struct_assign(s10_param, native_param, fl_list.fl_object_find_init);
 532         struct_assign(s10_param, native_param, fl_list.fl_object_find);
 533         struct_assign(s10_param, native_param, fl_list.fl_object_find_final);
 534 
 535         struct_assign(s10_param, native_param, fl_list.fl_key_generate);
 536         struct_assign(s10_param, native_param, fl_list.fl_key_generate_pair);
 537         struct_assign(s10_param, native_param, fl_list.fl_key_wrap);
 538         struct_assign(s10_param, native_param, fl_list.fl_key_unwrap);
 539         struct_assign(s10_param, native_param, fl_list.fl_key_derive);
 540 
 541         struct_assign(s10_param, native_param, fl_list.fl_init_token);
 542         struct_assign(s10_param, native_param, fl_list.fl_init_pin);
 543         struct_assign(s10_param, native_param, fl_list.fl_set_pin);
 544 
 545         struct_assign(s10_param, native_param, fl_list.prov_is_hash_limited);
 546         struct_assign(s10_param, native_param, fl_list.prov_hash_threshold);
 547         struct_assign(s10_param, native_param, fl_list.prov_hash_limit);
 548 
 549         return (brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param)));
 550 }
 551 
 552 /*
 553  * The process contract CT_TGET and CT_TSET parameter structure ct_param_t
 554  * changed between S10 and Nevada, so we have to emulate the old S10
 555  * ct_param_t structure when interposing on the ioctl syscall.
 556  */
 557 typedef struct s10_ct_param {
 558         uint32_t ctpm_id;
 559         uint32_t ctpm_pad;
 560         uint64_t ctpm_value;
 561 } s10_ct_param_t;
 562 
 563 /*
 564  * We have to emulate process contract ioctls for init(1M) because the
 565  * ioctl parameter structure changed between S10 and Nevada.  This is
 566  * a relatively simple process of filling Nevada structure fields,
 567  * shuffling values, and initiating a native system call.
 568  *
 569  * For now, we'll assume that all consumers of CT_TGET and CT_TSET will
 570  * need emulation.  We'll issue a stat to make sure that the ioctl
 571  * is meant for the contract file system.
 572  *
 573  */
 574 static int
 575 ctfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 576 {
 577         int err;
 578         s10_ct_param_t s10param;
 579         ct_param_t param;
 580         struct stat statbuf;
 581 
 582         if ((err = __systemcall(rval, SYS_fstatat + 1024,
 583             fdes, NULL, &statbuf, 0)) != 0)
 584                 return (err);
 585         if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0)
 586                 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
 587 
 588         if (brand_uucopy((const void *)arg, &s10param, sizeof (s10param)) != 0)
 589                 return (EFAULT);
 590         param.ctpm_id = s10param.ctpm_id;
 591         param.ctpm_size = sizeof (uint64_t);
 592         param.ctpm_value = &s10param.ctpm_value;
 593         if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &param))
 594             != 0)
 595                 return (err);
 596 
 597         if (cmd == CT_TGET)
 598                 return (brand_uucopy(&s10param, (void *)arg,
 599                     sizeof (s10param)));
 600 
 601         return (0);
 602 }
 603 
 604 /*
 605  * ZFS ioctls have changed in each Solaris 10 (S10) release as well as in
 606  * Solaris Next.  The brand wraps ZFS commands so that the native commands
 607  * are used, but we want to be sure no command sneaks in that uses ZFS
 608  * without our knowledge.  We'll abort the process if we see a ZFS ioctl.
 609  */
 610 static int
 611 zfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 612 {
 613         static dev_t zfs_dev = (dev_t)-1;
 614         int err;
 615 
 616         if (passthru_otherdev_ioctl(&zfs_dev, ZFS_DEV, &err,
 617             rval, fdes, cmd, arg) == 1)
 618                 return (err);
 619 
 620         brand_abort(0, "ZFS ioctl!");
 621         /*NOTREACHED*/
 622         return (0);
 623 }
 624 
 625 struct s10_lofi_ioctl {
 626         uint32_t li_minor;
 627         boolean_t li_force;
 628         char li_filename[MAXPATHLEN + 1];
 629 };
 630 
 631 static int
 632 lofi_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 633 {
 634         static dev_t lofi_dev = (dev_t)-1;
 635         struct s10_lofi_ioctl s10_param;
 636         struct lofi_ioctl native_param;
 637         int err;
 638 
 639         if (passthru_otherdev_ioctl(&lofi_dev, "/dev/lofictl", &err,
 640             rval, fdes, cmd, arg) == 1)
 641                 return (err);
 642 
 643         if (brand_uucopy((const void *)arg, &s10_param,
 644             sizeof (s10_param)) != 0)
 645                 return (EFAULT);
 646 
 647         /*
 648          * Somewhat weirdly, EIO is what the S10 lofi driver would
 649          * return for unrecognised cmds.
 650          */
 651         if (cmd >= LOFI_CHECK_COMPRESSED)
 652                 return (EIO);
 653 
 654         bzero(&native_param, sizeof (native_param));
 655 
 656         struct_assign(native_param, s10_param, li_minor);
 657         struct_assign(native_param, s10_param, li_force);
 658 
 659         /*
 660          * Careful here, this has changed from [MAXPATHLEN + 1] to
 661          * [MAXPATHLEN].
 662          */
 663         bcopy(s10_param.li_filename, native_param.li_filename,
 664             sizeof (native_param.li_filename));
 665         native_param.li_filename[MAXPATHLEN - 1] = '\0';
 666 
 667         err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &native_param);
 668 
 669         struct_assign(s10_param, native_param, li_minor);
 670         /* li_force is input-only */
 671 
 672         bcopy(native_param.li_filename, s10_param.li_filename,
 673             sizeof (native_param.li_filename));
 674 
 675         (void) brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param));
 676         return (err);
 677 }
 678 
 679 int
 680 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
 681 {
 682         switch (cmd) {
 683         case CRYPTO_GET_FUNCTION_LIST:
 684                 return (crypto_ioctl(rval, fdes, cmd, arg));
 685         case CT_TGET:
 686                 /*FALLTHRU*/
 687         case CT_TSET:
 688                 return (ctfs_ioctl(rval, fdes, cmd, arg));
 689         case MNTIOC_GETMNTENT:
 690                 /*FALLTHRU*/
 691         case MNTIOC_GETEXTMNTENT:
 692                 /*FALLTHRU*/
 693         case MNTIOC_GETMNTANY:
 694                 return (mntfs_ioctl(rval, fdes, cmd, arg));
 695         }
 696 
 697         switch (cmd & ~0xff) {
 698         case ZFS_IOC:
 699                 return (zfs_ioctl(rval, fdes, cmd, arg));
 700 
 701         case LOFI_IOC_BASE:
 702                 return (lofi_ioctl(rval, fdes, cmd, arg));
 703 
 704         default:
 705                 break;
 706         }
 707 
 708         return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
 709 }
 710 
 711 /*
 712  * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when
 713  * applied to files opened with O_APPEND.  The offset argument is ignored and
 714  * the buffer is appended to the target file in S10, whereas the current file
 715  * position is ignored in Nevada (i.e., pwrite() acts as though the target file
 716  * wasn't opened with O_APPEND).  This is a result of the fix for CR 6655660
 717  * (pwrite() must ignore the O_APPEND/FAPPEND flag).
 718  *
 719  * We emulate the old S10 pwrite() behavior by checking whether the target file
 720  * was opened with O_APPEND.  If it was, then invoke the write() system call
 721  * instead of pwrite(); otherwise, invoke the pwrite() system call as usual.
 722  */
 723 static int
 724 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes,
 725     off_t offset)
 726 {
 727         int err;
 728 
 729         if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0)
 730                 return (err);
 731         if (rval->sys_rval1 & O_APPEND)
 732                 return (__systemcall(rval, SYS_write + 1024, fd, bufferp,
 733                     num_bytes));
 734         return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes,
 735             offset));
 736 }
 737 
 738 #if !defined(_LP64)
 739 /*
 740  * This is the large file version of the pwrite() system call for 32-bit
 741  * processes.  This exists for the same reason that s10_pwrite() exists; see
 742  * the comment above s10_pwrite().
 743  */
 744 static int
 745 s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes,
 746     uint32_t offset_1, uint32_t offset_2)
 747 {
 748         int err;
 749 
 750         if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0)
 751                 return (err);
 752         if (rval->sys_rval1 & O_APPEND)
 753                 return (__systemcall(rval, SYS_write + 1024, fd, bufferp,
 754                     num_bytes));
 755         return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp,
 756             num_bytes, offset_1, offset_2));
 757 }
 758 #endif  /* !_LP64 */
 759 
 760 /*
 761  * These are convenience macros that s10_getdents_common() uses.  Both treat
 762  * their arguments, which should be character pointers, as dirent pointers or
 763  * dirent64 pointers and yield their d_name and d_reclen fields.  These
 764  * macros shouldn't be used outside of s10_getdents_common().
 765  */
 766 #define dirent_name(charptr)    ((charptr) + name_offset)
 767 #define dirent_reclen(charptr)  \
 768         (*(unsigned short *)(uintptr_t)((charptr) + reclen_offset))
 769 
 770 /*
 771  * This function contains code that is common to both s10_getdents() and
 772  * s10_getdents64().  See the comment above s10_getdents() for details.
 773  *
 774  * rval, fd, buf, and nbyte should be passed unmodified from s10_getdents()
 775  * and s10_getdents64().  getdents_syscall_id should be either SYS_getdents
 * or SYS_getdents64.  name_offset should be the byte offset of
 777  * the d_name field in the dirent structures passed to the kernel via the
 778  * syscall represented by getdents_syscall_id.  reclen_offset should be
 779  * the byte offset of the d_reclen field in the aforementioned dirent
 780  * structures.
 781  */
 782 static int
 783 s10_getdents_common(sysret_t *rval, int fd, char *buf, size_t nbyte,
 784     int getdents_syscall_id, size_t name_offset, size_t reclen_offset)
 785 {
 786         int err;
 787         size_t buf_size;
 788         char *local_buf;
 789         char *buf_current;
 790 
 791         /*
 792          * Use a special brand operation, B_S10_ISFDXATTRDIR, to determine
 793          * whether the specified file descriptor refers to an extended file
 794          * attribute directory.  If it doesn't, then SYS_getdents won't
 795          * reveal extended file attributes, in which case we can simply
 796          * hand the syscall to the native kernel.
 797          */
 798         if ((err = __systemcall(rval, SYS_brand + 1024, B_S10_ISFDXATTRDIR,
 799             fd)) != 0)
 800                 return (err);
 801         if (rval->sys_rval1 == 0)
 802                 return (__systemcall(rval, getdents_syscall_id + 1024, fd, buf,
 803                     nbyte));
 804 
 805         /*
 806          * The file descriptor refers to an extended file attributes directory.
 807          * We need to create a dirent buffer that's as large as buf into which
 808          * the native SYS_getdents will store the special extended file
 809          * attribute directory's entries.  We can't dereference buf because
 810          * it might be an invalid pointer!
 811          */
 812         if (nbyte > MAXGETDENTS_SIZE)
 813                 nbyte = MAXGETDENTS_SIZE;
 814         local_buf = (char *)malloc(nbyte);
 815         if (local_buf == NULL) {
 816                 /*
 817                  * getdents(2) doesn't return an error code indicating a memory
 818                  * allocation error and it doesn't make sense to return any of
 819                  * its documented error codes for a malloc(3C) failure.  We'll
 820                  * use ENOMEM even though getdents(2) doesn't use it because it
 821                  * best describes the failure.
 822                  */
 823                 (void) B_TRUSS_POINT_3(rval, getdents_syscall_id, ENOMEM, fd,
 824                     buf, nbyte);
 825                 rval->sys_rval1 = -1;
 826                 rval->sys_rval2 = 0;
 827                 return (EIO);
 828         }
 829 
 830         /*
 831          * Issue a native SYS_getdents syscall but use our local dirent buffer
 832          * instead of buf.  This will allow us to examine the returned dirent
 833          * structures immediately and copy them to buf later.  That way the
 834          * calling process won't be able to see the dirent structures until
 835          * we finish examining them.
 836          */
 837         if ((err = __systemcall(rval, getdents_syscall_id + 1024, fd, local_buf,
 838             nbyte)) != 0) {
 839                 free(local_buf);
 840                 return (err);
 841         }
 842         buf_size = rval->sys_rval1;
 843         if (buf_size == 0) {
 844                 free(local_buf);
 845                 return (0);
 846         }
 847 
 848         /*
 849          * Look for SUNWattr_ro (VIEW_READONLY) and SUNWattr_rw
 850          * (VIEW_READWRITE) in the directory entries and remove them
 851          * from the dirent buffer.
 852          */
 853         for (buf_current = local_buf;
 854             (size_t)(buf_current - local_buf) < buf_size; /* cstyle */) {
 855                 if (strcmp(dirent_name(buf_current), VIEW_READONLY) != 0 &&
 856                     strcmp(dirent_name(buf_current), VIEW_READWRITE) != 0) {
 857                         /*
 858                          * The dirent refers to an attribute that should
 859                          * be visible to Solaris 10 processes.  Keep it
 860                          * and examine the next entry in the buffer.
 861                          */
 862                         buf_current += dirent_reclen(buf_current);
 863                 } else {
 864                         /*
 865                          * We found either SUNWattr_ro (VIEW_READONLY)
 866                          * or SUNWattr_rw (VIEW_READWRITE).  Remove it
 867                          * from the dirent buffer by decrementing
 868                          * buf_size by the size of the entry and
 869                          * overwriting the entry with the remaining
 870                          * entries.
 871                          */
 872                         buf_size -= dirent_reclen(buf_current);
 873                         (void) memmove(buf_current, buf_current +
 874                             dirent_reclen(buf_current), buf_size -
 875                             (size_t)(buf_current - local_buf));
 876                 }
 877         }
 878 
 879         /*
 880          * Copy local_buf into buf so that the calling process can see
 881          * the results.
 882          */
 883         if ((err = brand_uucopy(local_buf, buf, buf_size)) != 0) {
 884                 free(local_buf);
 885                 rval->sys_rval1 = -1;
 886                 rval->sys_rval2 = 0;
 887                 return (err);
 888         }
 889         rval->sys_rval1 = buf_size;
 890         free(local_buf);
 891         return (0);
 892 }
 893 
 894 /*
 895  * Solaris Next added two special extended file attributes, SUNWattr_ro and
 896  * SUNWattr_rw, which are called "extended system attributes".  They have
 897  * special semantics (e.g., a process cannot unlink SUNWattr_ro) and should
 898  * not appear in solaris10-branded zones because no Solaris 10 applications,
 899  * including system commands such as tar(1), are coded to correctly handle these
 900  * special attributes.
 901  *
 902  * This emulation function solves the aforementioned problem by emulating
 903  * the getdents(2) syscall and filtering both system attributes out of resulting
 904  * directory entry lists.  The emulation function only filters results when
 905  * the given file descriptor refers to an extended file attribute directory.
 906  * Filtering getdents(2) results is expensive because it requires dynamic
 907  * memory allocation; however, the performance cost is tolerable because
 908  * we don't expect Solaris 10 processes to frequently examine extended file
 909  * attribute directories.
 910  *
 911  * The brand's emulation library needs two getdents(2) emulation functions
 912  * because getdents(2) comes in two flavors: non-largefile-aware getdents(2)
 913  * and largefile-aware getdents64(2).  s10_getdents() handles the non-largefile-
 914  * aware case for 32-bit processes and all getdents(2) syscalls for 64-bit
 915  * processes (64-bit processes use largefile-aware interfaces by default).
 916  * See s10_getdents64() below for the largefile-aware getdents64(2) emulation
 917  * function for 32-bit processes.
 918  */
 919 static int
 920 s10_getdents(sysret_t *rval, int fd, struct dirent *buf, size_t nbyte)
 921 {
 922         return (s10_getdents_common(rval, fd, (char *)buf, nbyte, SYS_getdents,
 923             offsetof(struct dirent, d_name),
 924             offsetof(struct dirent, d_reclen)));
 925 }
 926 
 927 #ifndef _LP64
 928 /*
 929  * This is the largefile-aware version of getdents(2) for 32-bit processes.
 930  * This exists for the same reason that s10_getdents() exists.  See the comment
 931  * above s10_getdents().
 932  */
 933 static int
 934 s10_getdents64(sysret_t *rval, int fd, struct dirent64 *buf, size_t nbyte)
 935 {
 936         return (s10_getdents_common(rval, fd, (char *)buf, nbyte,
 937             SYS_getdents64, offsetof(struct dirent64, d_name),
 938             offsetof(struct dirent64, d_reclen)));
 939 }
 940 #endif  /* !_LP64 */
 941 
 942 #define S10_TRIVIAL_ACL_CNT     6
 943 #define NATIVE_TRIVIAL_ACL_CNT  3
 944 
 945 /*
 946  * Check if the ACL qualifies as a trivial ACL based on the native
 947  * interpretation.
 948  */
 949 static boolean_t
 950 has_trivial_native_acl(int cmd, int cnt, const char *fname, int fd)
 951 {
 952         int i, err;
 953         sysret_t rval;
 954         ace_t buf[NATIVE_TRIVIAL_ACL_CNT];
 955 
 956         if (fname != NULL)
 957                 err = __systemcall(&rval, SYS_pathconf + 1024, fname,
 958                     _PC_ACL_ENABLED);
 959         else
 960                 err = __systemcall(&rval, SYS_fpathconf + 1024, fd,
 961                     _PC_ACL_ENABLED);
 962         if (err != 0 || rval.sys_rval1 != _ACL_ACE_ENABLED)
 963                 return (B_FALSE);
 964 
 965         /*
 966          * If we just got the ACL cnt, we don't need to get it again, its
 967          * passed in as the cnt arg.
 968          */
 969         if (cmd != ACE_GETACLCNT) {
 970                 if (fname != NULL) {
 971                         if (__systemcall(&rval, SYS_acl + 1024, fname,
 972                             ACE_GETACLCNT, 0, NULL) != 0)
 973                                 return (B_FALSE);
 974                 } else {
 975                         if (__systemcall(&rval, SYS_facl + 1024, fd,
 976                             ACE_GETACLCNT, 0, NULL) != 0)
 977                                 return (B_FALSE);
 978                 }
 979                 cnt = rval.sys_rval1;
 980         }
 981 
 982         if (cnt != NATIVE_TRIVIAL_ACL_CNT)
 983                 return (B_FALSE);
 984 
 985         if (fname != NULL) {
 986                 if (__systemcall(&rval, SYS_acl + 1024, fname, ACE_GETACL, cnt,
 987                     buf) != 0)
 988                         return (B_FALSE);
 989         } else {
 990                 if (__systemcall(&rval, SYS_facl + 1024, fd, ACE_GETACL, cnt,
 991                     buf) != 0)
 992                         return (B_FALSE);
 993         }
 994 
 995         /*
 996          * The following is based on the logic from the native OS
 997          * ace_trivial_common() to determine if the native ACL is trivial.
 998          */
 999         for (i = 0; i < cnt; i++) {
1000                 switch (buf[i].a_flags & ACE_TYPE_FLAGS) {
1001                 case ACE_OWNER:
1002                 case ACE_GROUP|ACE_IDENTIFIER_GROUP:
1003                 case ACE_EVERYONE:
1004                         break;
1005                 default:
1006                         return (B_FALSE);
1007                 }
1008 
1009                 if (buf[i].a_flags & (ACE_FILE_INHERIT_ACE|
1010                     ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
1011                     ACE_INHERIT_ONLY_ACE))
1012                         return (B_FALSE);
1013 
1014                 /*
1015                  * Special check for some special bits
1016                  *
1017                  * Don't allow anybody to deny reading basic
1018                  * attributes or a files ACL.
1019                  */
1020                 if (buf[i].a_access_mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
1021                     buf[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE)
1022                         return (B_FALSE);
1023 
1024                 /*
1025                  * Delete permissions are never set by default
1026                  */
1027                 if (buf[i].a_access_mask & (ACE_DELETE|ACE_DELETE_CHILD))
1028                         return (B_FALSE);
1029                 /*
1030                  * only allow owner@ to have
1031                  * write_acl/write_owner/write_attributes/write_xattr/
1032                  */
1033                 if (buf[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
1034                     (!(buf[i].a_flags & ACE_OWNER) && (buf[i].a_access_mask &
1035                     (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
1036                     ACE_WRITE_NAMED_ATTRS))))
1037                         return (B_FALSE);
1038 
1039         }
1040 
1041         return (B_TRUE);
1042 }
1043 
1044 /*
1045  * The following logic is based on the S10 adjust_ace_pair_common() code.
1046  */
1047 static void
1048 s10_adjust_ace_mask(void *pair, size_t access_off, size_t pairsize, mode_t mode)
1049 {
1050         char *datap = (char *)pair;
1051         uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
1052         uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
1053             access_off);
1054 
1055         if (mode & S_IROTH)
1056                 *amask1 |= ACE_READ_DATA;
1057         else
1058                 *amask0 |= ACE_READ_DATA;
1059         if (mode & S_IWOTH)
1060                 *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
1061         else
1062                 *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
1063         if (mode & S_IXOTH)
1064                 *amask1 |= ACE_EXECUTE;
1065         else
1066                 *amask0 |= ACE_EXECUTE;
1067 }
1068 
1069 /*
1070  * Construct a trivial S10 style ACL.
1071  */
1072 static int
1073 make_trivial_s10_acl(const char *fname, int fd, ace_t *bp)
1074 {
1075         int err;
1076         sysret_t rval;
1077         struct stat64 buf;
1078         ace_t trivial_s10_acl[] = {
1079                 {(uint_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
1080                 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1081                     ACE_WRITE_NAMED_ATTRS, ACE_OWNER,
1082                     ACE_ACCESS_ALLOWED_ACE_TYPE},
1083                 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1084                     ACE_ACCESS_DENIED_ACE_TYPE},
1085                 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1086                     ACE_ACCESS_ALLOWED_ACE_TYPE},
1087                 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1088                     ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE,
1089                     ACE_ACCESS_DENIED_ACE_TYPE},
1090                 {(uint_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|
1091                     ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE, ACE_EVERYONE,
1092                     ACE_ACCESS_ALLOWED_ACE_TYPE}
1093         };
1094 
1095         if (fname != NULL) {
1096                 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, AT_FDCWD,
1097                     fname, &buf, 0)) != 0)
1098                         return (err);
1099         } else {
1100                 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, fd,
1101                     NULL, &buf, 0)) != 0)
1102                         return (err);
1103         }
1104 
1105         s10_adjust_ace_mask(&trivial_s10_acl[0], offsetof(ace_t, a_access_mask),
1106             sizeof (ace_t), (buf.st_mode & 0700) >> 6);
1107         s10_adjust_ace_mask(&trivial_s10_acl[2], offsetof(ace_t, a_access_mask),
1108             sizeof (ace_t), (buf.st_mode & 0070) >> 3);
1109         s10_adjust_ace_mask(&trivial_s10_acl[4], offsetof(ace_t, a_access_mask),
1110             sizeof (ace_t), buf.st_mode & 0007);
1111 
1112         if (brand_uucopy(&trivial_s10_acl, bp, sizeof (trivial_s10_acl)) != 0)
1113                 return (EFAULT);
1114 
1115         return (0);
1116 }
1117 
1118 /*
1119  * The definition of a trivial ace-style ACL (used by ZFS and NFSv4) has been
1120  * simplified since S10.  Instead of 6 entries on a trivial S10 ACE ACL we now
1121  * have 3 streamlined entries.  The new, simpler trivial style confuses S10
1122  * commands such as 'ls -v' or 'cp -p' which don't see the expected S10 trivial
1123  * ACL entries and thus assume that there is a complex ACL on the file.
1124  *
1125  * See: PSARC/2010/029 Improved ACL interoperability
1126  *
 * Note that the trivial ACL detection code is implemented in acl_trivial() in
1128  * lib/libsec/common/aclutils.c.  It always uses the acl() syscall (not the
1129  * facl syscall) to determine if an ACL is trivial.  However, we emulate both
1130  * acl() and facl() so that the two provide consistent results.
1131  *
1132  * We don't currently try to emulate setting of ACLs since the primary
1133  * consumer of this feature is SMB or NFSv4 servers, neither of which are
1134  * supported in solaris10-branded zones.  If ACLs are used they must be set on
1135  * files using the native OS interpretation.
1136  */
1137 int
1138 s10_acl(sysret_t *rval, const char *fname, int cmd, int nentries, void *aclbufp)
1139 {
1140         int res;
1141 
1142         res = __systemcall(rval, SYS_acl + 1024, fname, cmd, nentries, aclbufp);
1143 
1144         switch (cmd) {
1145         case ACE_GETACLCNT:
1146                 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1147                     rval->sys_rval1, fname, 0)) {
1148                         rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1149                 }
1150                 break;
1151         case ACE_GETACL:
1152                 if (res == 0 &&
1153                     has_trivial_native_acl(ACE_GETACL, 0, fname, 0) &&
1154                     nentries >= S10_TRIVIAL_ACL_CNT) {
1155                         res = make_trivial_s10_acl(fname, 0, aclbufp);
1156                         rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1157                 }
1158                 break;
1159         }
1160 
1161         return (res);
1162 }
1163 
1164 int
1165 s10_facl(sysret_t *rval, int fdes, int cmd, int nentries, void *aclbufp)
1166 {
1167         int res;
1168 
1169         res = __systemcall(rval, SYS_facl + 1024, fdes, cmd, nentries, aclbufp);
1170 
1171         switch (cmd) {
1172         case ACE_GETACLCNT:
1173                 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1174                     rval->sys_rval1, NULL, fdes)) {
1175                         rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1176                 }
1177                 break;
1178         case ACE_GETACL:
1179                 if (res == 0 &&
1180                     has_trivial_native_acl(ACE_GETACL, 0, NULL, fdes) &&
1181                     nentries >= S10_TRIVIAL_ACL_CNT) {
1182                         res = make_trivial_s10_acl(NULL, fdes, aclbufp);
1183                         rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1184                 }
1185                 break;
1186         }
1187 
1188         return (res);
1189 }
1190 
1191 #define S10_AC_PROC             (0x1 << 28)
1192 #define S10_AC_TASK             (0x2 << 28)
1193 #define S10_AC_FLOW             (0x4 << 28)
1194 #define S10_AC_MODE(x)          ((x) & 0xf0000000)
1195 #define S10_AC_OPTION(x)        ((x) & 0x0fffffff)
1196 
1197 /*
1198  * The mode shift, mode mask and option mask for acctctl have changed.  The
1199  * mode is currently the top full byte and the option is the lower 3 full bytes.
1200  */
1201 int
1202 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz)
1203 {
1204         int mode = S10_AC_MODE(cmd);
1205         int option = S10_AC_OPTION(cmd);
1206 
1207         switch (mode) {
1208         case S10_AC_PROC:
1209                 mode = AC_PROC;
1210                 break;
1211         case S10_AC_TASK:
1212                 mode = AC_TASK;
1213                 break;
1214         case S10_AC_FLOW:
1215                 mode = AC_FLOW;
1216                 break;
1217         default:
1218                 return (B_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf,
1219                     bufsz));
1220         }
1221 
1222         return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf,
1223             bufsz));
1224 }
1225 
1226 /*
1227  * The Audit Policy parameters have changed due to:
1228  *    6466722 audituser and AUDIT_USER are defined, unused, undocumented and
1229  *            should be removed.
1230  *
1231  * In S10 we had the following flag:
1232  *      #define AUDIT_USER 0x0040
1233  * which doesn't exist in Solaris Next where the subsequent flags are shifted
1234  * down.  For example, in S10 we had:
1235  *      #define AUDIT_GROUP     0x0080
1236  * but on Solaris Next we have:
1237  *      #define AUDIT_GROUP     0x0040
1238  * AUDIT_GROUP has the value AUDIT_USER had in S10 and all of the subsequent
1239  * bits are also shifted one place.
1240  *
1241  * When we're getting or setting the Audit Policy parameters we need to
1242  * shift the outgoing or incoming bits into their proper positions.  Since
1243  * S10_AUDIT_USER was always unused, we always clear that bit on A_GETPOLICY.
1244  *
1245  * The command we care about, BSM_AUDITCTL, passes the most parameters (3),
1246  * so declare this function to take up to 4 args and just pass them on.
1247  * The number of parameters for s10_auditsys needs to be equal to the BSM_*
1248  * subcommand that has the most parameters, since we want to pass all
1249  * parameters through, regardless of which subcommands we interpose on.
1250  *
1251  * Note that the auditsys system call uses the SYSENT_AP macro wrapper instead
1252  * of the more common SYSENT_CI macro.  This means the return value is a
1253  * SE_64RVAL so the syscall table uses RV_64RVAL.
1254  */
1255 
1256 #define S10_AUDIT_HMASK 0xffffffc0
1257 #define S10_AUDIT_LMASK 0x3f
1258 #define S10_AUC_NOSPACE 0x3
1259 
1260 int
1261 s10_auditsys(sysret_t *rval, int bsmcmd, intptr_t a0, intptr_t a1, intptr_t a2)
1262 {
1263         int         err;
1264         uint32_t    m;
1265 
1266         if (bsmcmd != BSM_AUDITCTL)
1267                 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1,
1268                     a2));
1269 
1270         if ((int)a0 == A_GETPOLICY) {
1271                 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1272                     &m, a2)) != 0)
1273                         return (err);
1274                 m = ((m & S10_AUDIT_HMASK) << 1) | (m & S10_AUDIT_LMASK);
1275                 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1276                         return (EFAULT);
1277                 return (0);
1278 
1279         } else if ((int)a0 == A_SETPOLICY) {
1280                 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1281                         return (EFAULT);
1282                 m = ((m >> 1) & S10_AUDIT_HMASK) | (m & S10_AUDIT_LMASK);
1283                 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1284                     a2));
1285         } else if ((int)a0 == A_GETCOND) {
1286                 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1287                     &m, a2)) != 0)
1288                         return (err);
1289                 if (m == AUC_NOSPACE)
1290                         m = S10_AUC_NOSPACE;
1291                 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1292                         return (EFAULT);
1293                 return (0);
1294         } else if ((int)a0 == A_SETCOND) {
1295                 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1296                         return (EFAULT);
1297                 if (m == S10_AUC_NOSPACE)
1298                         m = AUC_NOSPACE;
1299                 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1300                     a2));
1301         }
1302 
1303         return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, a2));
1304 }
1305 
1306 /*
1307  * Determine whether the executable passed to SYS_exec or SYS_execve is a
1308  * native executable.  The s10_npreload.so invokes the B_S10_NATIVE brand
1309  * operation which patches up the processes exec info to eliminate any trace
1310  * of the wrapper.  That will make pgrep and other commands that examine
1311  * process' executable names and command-line parameters work properly.
1312  */
1313 static int
1314 s10_exec_native(sysret_t *rval, const char *fname, const char **argp,
1315     const char **envp)
1316 {
1317         const char *filename = fname;
1318         char path[64];
1319         int err;
1320 
1321         /* Get a copy of the executable we're trying to run */
1322         path[0] = '\0';
1323         (void) brand_uucopystr(filename, path, sizeof (path));
1324 
1325         /* Check if we're trying to run a native binary */
1326         if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native",
1327             sizeof (path)) != 0)
1328                 return (0);
1329 
1330         /* Skip the first element in the argv array */
1331         argp++;
1332 
1333         /*
1334          * The the path of the dynamic linker is the second parameter
1335          * of s10_native_exec().
1336          */
1337         if (brand_uucopy(argp, &filename, sizeof (char *)) != 0)
1338                 return (EFAULT);
1339 
1340         /* If an exec call succeeds, it never returns */
1341         err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename,
1342             argp, envp, NULL, NULL, NULL);
1343         brand_assert(err != 0);
1344         return (err);
1345 }
1346 
1347 /*
1348  * Interpose on the SYS_exec syscall to detect native wrappers.
1349  */
1350 int
1351 s10_exec(sysret_t *rval, const char *fname, const char **argp)
1352 {
1353         int err;
1354 
1355         if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0)
1356                 return (err);
1357 
1358         /* If an exec call succeeds, it never returns */
1359         err = __systemcall(rval, SYS_execve + 1024, fname, argp, NULL);
1360         brand_assert(err != 0);
1361         return (err);
1362 }
1363 
1364 /*
1365  * Interpose on the SYS_execve syscall to detect native wrappers.
1366  */
1367 int
1368 s10_execve(sysret_t *rval, const char *fname, const char **argp,
1369     const char **envp)
1370 {
1371         int err;
1372 
1373         if ((err = s10_exec_native(rval, fname, argp, envp)) != 0)
1374                 return (err);
1375 
1376         /* If an exec call succeeds, it never returns */
1377         err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp);
1378         brand_assert(err != 0);
1379         return (err);
1380 }
1381 
1382 /*
1383  * S10's issetugid() syscall is now a subcode to privsys().
1384  */
1385 static int
1386 s10_issetugid(sysret_t *rval)
1387 {
1388         return (__systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID,
1389             0, 0, 0, 0, 0));
1390 }
1391 
1392 /*
1393  * S10's socket() syscall does not split type and flags
1394  */
1395 static int
1396 s10_so_socket(sysret_t *rval, int domain, int type, int protocol,
1397     char *devpath, int version)
1398 {
1399         if ((type & ~SOCK_TYPE_MASK) != 0) {
1400                 errno = EINVAL;
1401                 return (-1);
1402         }
1403         return (__systemcall(rval, SYS_so_socket + 1024, domain, type,
1404             protocol, devpath, version));
1405 }
1406 
1407 /*
1408  * S10's pipe() syscall has a different calling convention
1409  */
1410 static int
1411 s10_pipe(sysret_t *rval)
1412 {
1413         int fds[2], err;
1414         if ((err = __systemcall(rval, SYS_pipe + 1024, fds, 0)) != 0)
1415                 return (err);
1416 
1417         rval->sys_rval1 = fds[0];
1418         rval->sys_rval2 = fds[1];
1419         return (0);
1420 }
1421 
1422 /*
1423  * S10's accept() syscall takes three arguments
1424  */
1425 static int
1426 s10_accept(sysret_t *rval, int sock, struct sockaddr *addr, uint_t *addrlen,
1427     int version)
1428 {
1429         return (__systemcall(rval, SYS_accept + 1024, sock, addr, addrlen,
1430             version, 0));
1431 }
1432 
1433 static long
1434 s10_uname(sysret_t *rv, uintptr_t p1)
1435 {
1436         struct utsname un, *unp = (struct utsname *)p1;
1437         int rev, err;
1438 
1439         if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0)
1440                 return (err);
1441 
1442         rev = atoi(&un.release[2]);
1443         brand_assert(rev >= 11);
1444         bzero(un.release, _SYS_NMLN);
1445         (void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN);
1446         bzero(un.version, _SYS_NMLN);
1447         (void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN);
1448 
1449         /* copy out the modified uname info */
1450         return (brand_uucopy(&un, unp, sizeof (un)));
1451 }
1452 
1453 int
1454 s10_sysconfig(sysret_t *rv, int which)
1455 {
1456         long value;
1457 
1458         /*
1459          * We must interpose on the sysconfig(2) requests
1460          * that deal with the realtime signal number range.
1461          * All others get passed to the native sysconfig(2).
1462          */
1463         switch (which) {
1464         case _CONFIG_RTSIG_MAX:
1465                 value = S10_SIGRTMAX - S10_SIGRTMIN + 1;
1466                 break;
1467         case _CONFIG_SIGRT_MIN:
1468                 value = S10_SIGRTMIN;
1469                 break;
1470         case _CONFIG_SIGRT_MAX:
1471                 value = S10_SIGRTMAX;
1472                 break;
1473         default:
1474                 return (__systemcall(rv, SYS_sysconfig + 1024, which));
1475         }
1476 
1477         (void) B_TRUSS_POINT_1(rv, SYS_sysconfig, 0, which);
1478         rv->sys_rval1 = value;
1479         rv->sys_rval2 = 0;
1480 
1481         return (0);
1482 }
1483 
1484 int
1485 s10_sysinfo(sysret_t *rv, int command, char *buf, long count)
1486 {
1487         char *value;
1488         int len;
1489 
1490         /*
1491          * We must interpose on the sysinfo(2) commands SI_RELEASE and
1492          * SI_VERSION; all others get passed to the native sysinfo(2)
1493          * command.
1494          */
1495         switch (command) {
1496                 case SI_RELEASE:
1497                         value = S10_UTS_RELEASE;
1498                         break;
1499 
1500                 case SI_VERSION:
1501                         value = S10_UTS_VERSION;
1502                         break;
1503 
1504                 default:
1505                         /*
1506                          * The default action is to pass the command to the
1507                          * native sysinfo(2) syscall.
1508                          */
1509                         return (__systemcall(rv, SYS_systeminfo + 1024,
1510                             command, buf, count));
1511         }
1512 
1513         len = strlen(value) + 1;
1514         if (count > 0) {
1515                 if (brand_uucopystr(value, buf, count) != 0)
1516                         return (EFAULT);
1517 
1518                 /*
1519                  * Assure NULL termination of buf as brand_uucopystr() doesn't.
1520                  */
1521                 if (len > count && brand_uucopy("\0", buf + (count - 1), 1)
1522                     != 0)
1523                         return (EFAULT);
1524         }
1525 
1526         /*
1527          * On success, sysinfo(2) returns the size of buffer required to hold
1528          * the complete value plus its terminating NULL byte.
1529          */
1530         (void) B_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count);
1531         rv->sys_rval1 = len;
1532         rv->sys_rval2 = 0;
1533         return (0);
1534 }
1535 
1536 #if defined(__x86)
1537 #if defined(__amd64)
1538 /*
1539  * 64-bit x86 LWPs created by SYS_lwp_create start here if they need to set
1540  * their %fs registers to the legacy Solaris 10 selector value.
1541  *
1542  * This function does three things:
1543  *
1544  *      1.  Trap to the kernel so that it can set %fs to the legacy Solaris 10
1545  *          selector value.
1546  *      2.  Read the LWP's true entry point (the entry point supplied by libc
1547  *          when SYS_lwp_create was invoked) from %r14.
1548  *      3.  Eliminate this function's stack frame and pass control to the LWP's
1549  *          true entry point.
1550  *
1551  * See the comment above s10_lwp_create_correct_fs() (see below) for the reason
1552  * why this function exists.
1553  */
1554 /*ARGSUSED*/
1555 static void
1556 s10_lwp_create_entry_point(void *ulwp_structp)
1557 {
1558         sysret_t rval;          /* scratch for __systemcall(); never read here */
1559 
1560         /*
1561          * The new LWP's %fs register is initially zero, but libc won't
1562          * function correctly when %fs is zero.  Change the LWP's %fs register
1563          * via SYS_brand.
1564          */
1565         (void) __systemcall(&rval, SYS_brand + 1024, B_S10_FSREGCORRECTION);
1566 
1567         /*
1568          * Jump to the true entry point, which is stored in %r14.
1569          * Remove our stack frame before jumping so that
1570          * s10_lwp_create_entry_point() won't be seen in stack traces.
1571          *
1572          * NOTE: s10_lwp_create_entry_point() pushes %r12 onto its stack frame
1573          * so that it can use it as a temporary register.  We don't restore %r12
1574          * in this assembly block because we don't care about its value (and
1575          * neither does _lwp_start()).  Besides, the System V ABI AMD64
1576          * Architecture Processor Supplement doesn't specify that %r12 should
1577          * have a special value when LWPs start, so we can ignore its value when
1578          * we jump to the true entry point.  Furthermore, %r12 is a callee-saved
1579          * register, so the true entry point should push %r12 onto its stack
1580          * before using the register.  We ignore %r14 after we read it for
1581          * similar reasons.
1582          *
1583          * NOTE: The compiler will generate a function epilogue for this
1584          * function despite the fact that the LWP will never execute it.
1585          * We could hand-code this entire function in assembly to eliminate
1586          * the epilogue, but the epilogue is only three or four instructions,
1587          * so we wouldn't save much space.  Besides, why would we want
1588          * to create yet another ugly, hard-to-maintain assembly function when
1589          * we could write most of it in C?
1590          */
1591         __asm__ __volatile__(
1592             "movq %0, %%rdi\n\t"        /* pass ulwp_structp as arg1 */
1593             "movq %%rbp, %%rsp\n\t"     /* eliminate the stack frame */
1594             "popq %%rbp\n\t"            /* ...by popping the saved %rbp */
1595             "jmp *%%r14\n\t"            /* jump to the true entry point */
1596             : : "r" (ulwp_structp));    /* no outputs; %0 = ulwp_structp */
1597         /*NOTREACHED*/
1598 }
1599 
1600 /*
1601  * The S10 libc expects that %fs will be nonzero for new 64-bit x86 LWPs but the
1602  * Nevada kernel clears %fs for such LWPs.  Unfortunately, new LWPs do not issue
1603  * SYS_lwp_private (see s10_lwp_private() below) after they are created, so
1604  * we must ensure that new LWPs invoke a brand operation that sets %fs to a
1605  * nonzero value immediately after their creation.
1606  *
1607  * The easiest way to do this is to make new LWPs start at a special function,
1608  * s10_lwp_create_entry_point() (see its definition above), that invokes the
1609  * brand operation that corrects %fs.  We'll store the entry points of new LWPs
1610  * in their %r14 registers so that s10_lwp_create_entry_point() can find and
1611  * call them after invoking the special brand operation.  %r14 is a callee-saved
1612  * register; therefore, any functions invoked by s10_lwp_create_entry_point()
1613  * and all functions dealing with signals (e.g., sigacthandler()) will preserve
1614  * %r14 for s10_lwp_create_entry_point().
1615  *
1616  * The Nevada kernel can safely work with nonzero %fs values because the kernel
1617  * configures per-thread %fs segment descriptors so that the legacy %fs selector
1618  * value will still work.  See the comment in lwp_load() regarding %fs and
1619  * %fsbase in 64-bit x86 processes.
1620  *
1621  * This emulation exists thanks to CRs 6467491 and 6501650.
1622  */
1623 static int
1624 s10_lwp_create_correct_fs(sysret_t *rval, ucontext_t *ucp, int flags,
1625     id_t *new_lwp)
1626 {
1627         ucontext_t s10_uc;
1628 
1629         /*
1630          * Copy the supplied ucontext_t structure to the local stack
1631          * frame and store the new LWP's entry point (the value of %rip
1632          * stored in the ucontext_t) in the new LWP's %r14 register.
1633          * Then make s10_lwp_create_entry_point() the new LWP's entry
1634          * point.
1635          */
1636         if (brand_uucopy(ucp, &s10_uc, sizeof (s10_uc)) != 0)
1637                 return (EFAULT);
1638 
1639         s10_uc.uc_mcontext.gregs[REG_R14] = s10_uc.uc_mcontext.gregs[REG_RIP];
1640         s10_uc.uc_mcontext.gregs[REG_RIP] = (greg_t)s10_lwp_create_entry_point;
1641 
1642         /*  fix up the signal mask */
1643         if (s10_uc.uc_flags & UC_SIGMASK)
1644                 (void) s10sigset_to_native(&s10_uc.uc_sigmask,
1645                     &s10_uc.uc_sigmask);
1646 
1647         /*
1648          * Issue SYS_lwp_create to create the new LWP.  We pass the
1649          * modified ucontext_t to make sure that the new LWP starts at
1650          * s10_lwp_create_entry_point().
1651          */
1652         return (__systemcall(rval, SYS_lwp_create + 1024, &s10_uc,
1653             flags, new_lwp));
1654 }
1655 #endif  /* __amd64 */
1656 
1657 /*
1658  * SYS_lwp_private is issued by libc_init() to set %fsbase in 64-bit x86
1659  * processes.  The Nevada kernel sets %fs to zero but the S10 libc expects
1660  * %fs to be nonzero.  We'll pass the issued system call to the kernel untouched
1661  * and invoke a brand operation to set %fs to the legacy S10 selector value.
1662  *
1663  * This emulation exists thanks to CRs 6467491 and 6501650.
1664  */
1665 static int
1666 s10_lwp_private(sysret_t *rval, int cmd, int which, uintptr_t base)
1667 {
1668 #if defined(__amd64)
1669         int err;
1670 
1671         /*
1672          * The current LWP's %fs register should be zero.  Determine whether the
1673          * Solaris 10 libc with which we're working functions correctly when %fs
1674          * is zero by calling thr_main() after issuing the SYS_lwp_private
1675          * syscall.  If thr_main() barfs (returns -1), then change the LWP's %fs
1676          * register via SYS_brand and patch brand_sysent_table so that issuing
1677          * SYS_lwp_create executes s10_lwp_create_correct_fs() rather than the
1678          * default s10_lwp_create().  s10_lwp_create_correct_fs() will
1679          * guarantee that new LWPs will have correct %fs values.
1680          */
1681         if ((err = __systemcall(rval, SYS_lwp_private + 1024, cmd, which,
1682             base)) != 0)
1683                 return (err);
1684         if (thr_main() == -1) {
1685                 /*
1686                  * SYS_lwp_private is only issued by libc_init(), which is
1687                  * executed when libc is first loaded by ld.so.1.  Thus we
1688                  * are guaranteed to be single-threaded at this point.  Even
1689                  * if we were multithreaded at this point, writing a 64-bit
1690                  * value to the st_callc field of a brand_sysent_table
1691                  * entry is guaranteed to be atomic on 64-bit x86 chips
1692                  * as long as the field is not split across cache lines
1693                  * (It shouldn't be.).  See chapter 8, section 1.1 of
1694                  * "The Intel 64 and IA32 Architectures Software Developer's
1695                  * Manual," Volume 3A for more details.
1696                  */
1697                 brand_sysent_table[SYS_lwp_create].st_callc =
1698                     (sysent_cb_t)s10_lwp_create_correct_fs;
1699                 return (__systemcall(rval, SYS_brand + 1024,
1700                     B_S10_FSREGCORRECTION));
1701         }
1702         return (0);
1703 #else   /* !__amd64 */
1704         return (__systemcall(rval, SYS_lwp_private + 1024, cmd, which, base));
1705 #endif  /* !__amd64 */
1706 }
1707 #endif  /* __x86 */
1708 
1709 /*
1710  * The OpenSolaris versions of lwp_mutex_timedlock() and lwp_mutex_trylock()
1711  * add an extra argument to the interfaces, a uintptr_t value for the mutex's
1712  * mutex_owner field.  The Solaris 10 libc assigns the mutex_owner field at
1713  * user-level, so we just make the extra argument be zero in both syscalls.
1714  */
1715 
1716 static int
1717 s10_lwp_mutex_timedlock(sysret_t *rval, lwp_mutex_t *lp, timespec_t *tsp)
1718 {
1719         return (__systemcall(rval, SYS_lwp_mutex_timedlock + 1024, lp, tsp, 0));
1720 }
1721 
1722 static int
1723 s10_lwp_mutex_trylock(sysret_t *rval, lwp_mutex_t *lp)
1724 {
1725         return (__systemcall(rval, SYS_lwp_mutex_trylock + 1024, lp, 0));
1726 }
1727 
1728 /*
1729  * If the emul_global_zone flag is set then emulate some aspects of the
1730  * zone system call.  In particular, emulate the global zone ID on the
1731  * ZONE_LOOKUP subcommand and emulate some of the global zone attributes
1732  * on the ZONE_GETATTR subcommand.  If the flag is not set or we're performing
1733  * some other operation, simply pass the calls through.
1734  */
1735 int
1736 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3,
1737     void *arg4)
1738 {
1739         char            *aval;
1740         int             len;
1741         zoneid_t        zid;
1742         int             attr;
1743         char            *buf;
1744         size_t          bufsize;
1745 
1746         /*
1747          * We only emulate the zone syscall for a subset of specific commands,
1748          * otherwise we just pass the call through.
1749          */
1750         if (!emul_global_zone)
1751                 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2,
1752                     arg3, arg4));
1753 
1754         switch (cmd) {
1755         case ZONE_LOOKUP:
1756                 (void) B_TRUSS_POINT_1(rval, SYS_zone, 0, cmd);
1757                 rval->sys_rval1 = GLOBAL_ZONEID;
1758                 rval->sys_rval2 = 0;
1759                 return (0);
1760 
1761         case ZONE_GETATTR:
1762                 zid = (zoneid_t)(uintptr_t)arg1;
1763                 attr = (int)(uintptr_t)arg2;
1764                 buf = (char *)arg3;
1765                 bufsize = (size_t)arg4;
1766 
1767                 /*
1768                  * If the request is for the global zone then we're emulating
1769                  * that, otherwise pass this thru.
1770                  */
1771                 if (zid != GLOBAL_ZONEID)
1772                         goto passthru;
1773 
1774                 switch (attr) {
1775                 case ZONE_ATTR_NAME:
1776                         aval = GLOBAL_ZONENAME;
1777                         break;
1778 
1779                 case ZONE_ATTR_BRAND:
1780                         aval = NATIVE_BRAND_NAME;
1781                         break;
1782                 default:
1783                         /*
1784                          * We only emulate a subset of the attrs, use the
1785                          * real zone id to pass thru the rest.
1786                          */
1787                         arg1 = (void *)(uintptr_t)zoneid;
1788                         goto passthru;
1789                 }
1790 
1791                 (void) B_TRUSS_POINT_5(rval, SYS_zone, 0, cmd, zid, attr,
1792                     buf, bufsize);
1793 
1794                 len = strlen(aval) + 1;
1795                 if (len > bufsize)
1796                         return (ENAMETOOLONG);
1797 
1798                 if (buf != NULL) {
1799                         if (len == 1) {
1800                                 if (brand_uucopy("\0", buf, 1) != 0)
1801                                         return (EFAULT);
1802                         } else {
1803                                 if (brand_uucopystr(aval, buf, len) != 0)
1804                                         return (EFAULT);
1805 
1806                                 /*
1807                                  * Assure NULL termination of "buf" as
1808                                  * brand_uucopystr() does NOT.
1809                                  */
1810                                 if (brand_uucopy("\0", buf + (len - 1), 1) != 0)
1811                                         return (EFAULT);
1812                         }
1813                 }
1814 
1815                 rval->sys_rval1 = len;
1816                 rval->sys_rval2 = 0;
1817                 return (0);
1818 
1819         default:
1820                 break;
1821         }
1822 
1823 passthru:
1824         return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3,
1825             arg4));
1826 }
1827 
1828 /*ARGSUSED*/
1829 int
1830 brand_init(int argc, char *argv[], char *envp[])
1831 {
1832         sysret_t                rval;
1833         ulong_t                 ldentry;
1834         int                     err;
1835         char                    *bname;
1836 
1837         brand_pre_init();
1838 
1839         /*
1840          * Cache the pid of the zone's init process and determine if
1841          * we're init(1m) for the zone.  Remember: we might be init
1842          * now, but as soon as we fork(2) we won't be.
1843          */
1844         (void) get_initpid_info();
1845 
1846         /* get the current zoneid */
1847         err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL);
1848         brand_assert(err == 0);
1849         zoneid = (zoneid_t)rval.sys_rval1;
1850 
1851         /* Get the zone's emulation bitmap. */
1852         if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid,
1853             S10_EMUL_BITMAP, emul_bitmap, sizeof (emul_bitmap))) != 0) {
1854                 brand_abort(err, "The zone's patch level is unsupported");
1855                 /*NOTREACHED*/
1856         }
1857 
1858         bname = basename(argv[0]);
1859 
1860         /*
1861          * In general we want the S10 commands that are zone-aware to continue
1862          * to behave as they normally do within a zone.  Since these commands
1863          * are zone-aware, they should continue to "do the right thing".
1864          * However, some zone-aware commands aren't going to work the way
1865          * we expect them to inside the branded zone.  In particular, the pkg
1866          * and patch commands will not properly manage all pkgs/patches
1867          * unless the commands think they are running in the global zone.  For
1868          * these commands we want to emulate the global zone.
1869          *
1870          * We don't do any emulation for pkgcond since it is typically used
1871          * in pkg/patch postinstall scripts and we want those scripts to do
1872          * the right thing inside a zone.
1873          *
1874          * One issue is the handling of hollow pkgs.  Since the pkgs are
1875          * hollow, they won't use pkgcond in their postinstall scripts.  These
1876          * pkgs typically are installing drivers so we handle that by
1877          * replacing add_drv and rem_drv in the s10_boot script.
1878          */
1879         if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 ||
1880             strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0)
1881                 emul_global_zone = B_TRUE;
1882 
1883         ldentry = brand_post_init(S10_VERSION, argc, argv, envp);
1884 
1885         brand_runexe(argv, ldentry);
1886         /*NOTREACHED*/
1887         brand_abort(0, "brand_runexe() returned");
1888         return (-1);
1889 }
1890 
1891 /*
1892  * This table must have at least NSYSCALL entries in it.
1893  *
1894  * The second parameter of each entry in the brand_sysent_table
1895  * contains the number of parameters and flags that describe the
1896  * syscall return value encoding.  See the block comments at the
1897  * top of this file for more information about the syscall return
1898  * value flags and when they should be used.
1899  */
1900 brand_sysent_table_t brand_sysent_table[] = {
1901 #if defined(__sparc) && !defined(__sparcv9)     /* slot 0 is emulated only here */
1902         EMULATE(brand_indir, 9 | RV_64RVAL),    /*   0 */
1903 #else   /* !(__sparc && !__sparcv9) */
1904         NOSYS,                                  /*   0 */
1905 #endif  /* __sparc && !__sparcv9 */
1906         NOSYS,                                  /*   1 */
1907         EMULATE(s10_forkall, 0 | RV_32RVAL2),   /*   2 */
1908         NOSYS,                                  /*   3 */
1909         NOSYS,                                  /*   4 */
1910         EMULATE(s10_open, 3 | RV_DEFAULT),      /*   5 */
1911         NOSYS,                                  /*   6 */
1912         EMULATE(s10_wait, 0 | RV_32RVAL2),      /*   7 */
1913         EMULATE(s10_creat, 2 | RV_DEFAULT),     /*   8 */
1914         EMULATE(s10_link, 2 | RV_DEFAULT),      /*   9 */
1915         EMULATE(s10_unlink, 1 | RV_DEFAULT),    /*  10 */
1916         EMULATE(s10_exec, 2 | RV_DEFAULT),      /*  11 */
1917         NOSYS,                                  /*  12 */
1918         NOSYS,                                  /*  13 */
1919         EMULATE(s10_mknod, 3 | RV_DEFAULT),     /*  14 */
1920         EMULATE(s10_chmod, 2 | RV_DEFAULT),     /*  15 */
1921         EMULATE(s10_chown, 3 | RV_DEFAULT),     /*  16 */
1922         NOSYS,                                  /*  17 */
1923         EMULATE(s10_stat, 2 | RV_DEFAULT),      /*  18 */
1924         NOSYS,                                  /*  19 */
1925         NOSYS,                                  /*  20 */
1926         NOSYS,                                  /*  21 */
1927         EMULATE(s10_umount, 1 | RV_DEFAULT),    /*  22 */
1928         NOSYS,                                  /*  23 */
1929         NOSYS,                                  /*  24 */
1930         NOSYS,                                  /*  25 */
1931         NOSYS,                                  /*  26 */
1932         NOSYS,                                  /*  27 */
1933         EMULATE(s10_fstat, 2 | RV_DEFAULT),     /*  28 */
1934         NOSYS,                                  /*  29 */
1935         EMULATE(s10_utime, 2 | RV_DEFAULT),     /*  30 */
1936         NOSYS,                                  /*  31 */
1937         NOSYS,                                  /*  32 */
1938         EMULATE(s10_access, 2 | RV_DEFAULT),    /*  33 */
1939         NOSYS,                                  /*  34 */
1940         NOSYS,                                  /*  35 */
1941         NOSYS,                                  /*  36 */
1942         EMULATE(s10_kill, 2 | RV_DEFAULT),      /*  37 */
1943         NOSYS,                                  /*  38 */
1944         NOSYS,                                  /*  39 */
1945         NOSYS,                                  /*  40 */
1946         EMULATE(s10_dup, 1 | RV_DEFAULT),       /*  41 */
1947         EMULATE(s10_pipe, 0 | RV_32RVAL2),      /*  42 */
1948         NOSYS,                                  /*  43 */
1949         NOSYS,                                  /*  44 */
1950         NOSYS,                                  /*  45 */
1951         NOSYS,                                  /*  46 */
1952         NOSYS,                                  /*  47 */
1953         NOSYS,                                  /*  48 */
1954         NOSYS,                                  /*  49 */
1955         NOSYS,                                  /*  50 */
1956         NOSYS,                                  /*  51 */
1957         NOSYS,                                  /*  52 */
1958         NOSYS,                                  /*  53 */
1959         EMULATE(s10_ioctl, 3 | RV_DEFAULT),     /*  54 */
1960         NOSYS,                                  /*  55 */
1961         NOSYS,                                  /*  56 */
1962         NOSYS,                                  /*  57 */
1963         NOSYS,                                  /*  58 */
1964         EMULATE(s10_execve, 3 | RV_DEFAULT),    /*  59 */
1965         NOSYS,                                  /*  60 */
1966         NOSYS,                                  /*  61 */
1967         NOSYS,                                  /*  62 */
1968         NOSYS,                                  /*  63 */
1969         NOSYS,                                  /*  64 */
1970         NOSYS,                                  /*  65 */
1971         NOSYS,                                  /*  66 */
1972         NOSYS,                                  /*  67 */
1973         NOSYS,                                  /*  68 */
1974         NOSYS,                                  /*  69 */
1975         NOSYS,                                  /*  70 */
1976         EMULATE(s10_acctctl, 3 | RV_DEFAULT),   /*  71 */
1977         NOSYS,                                  /*  72 */
1978         NOSYS,                                  /*  73 */
1979         NOSYS,                                  /*  74 */
1980         EMULATE(s10_issetugid, 0 | RV_DEFAULT), /*  75 */
1981         EMULATE(s10_fsat, 6 | RV_DEFAULT),      /*  76 */
1982         NOSYS,                                  /*  77 */
1983         NOSYS,                                  /*  78 */
1984         EMULATE(s10_rmdir, 1 | RV_DEFAULT),     /*  79 */
1985         EMULATE(s10_mkdir, 2 | RV_DEFAULT),     /*  80 */
1986         EMULATE(s10_getdents, 3 | RV_DEFAULT),  /*  81 */
1987         NOSYS,                                  /*  82 */
1988         NOSYS,                                  /*  83 */
1989         NOSYS,                                  /*  84 */
1990         NOSYS,                                  /*  85 */
1991         NOSYS,                                  /*  86 */
1992         EMULATE(s10_poll, 3 | RV_DEFAULT),      /*  87 */
1993         EMULATE(s10_lstat, 2 | RV_DEFAULT),     /*  88 */
1994         EMULATE(s10_symlink, 2 | RV_DEFAULT),   /*  89 */
1995         EMULATE(s10_readlink, 3 | RV_DEFAULT),  /*  90 */
1996         NOSYS,                                  /*  91 */
1997         NOSYS,                                  /*  92 */
1998         EMULATE(s10_fchmod, 2 | RV_DEFAULT),    /*  93 */
1999         EMULATE(s10_fchown, 3 | RV_DEFAULT),    /*  94 */
2000         EMULATE(s10_sigprocmask, 3 | RV_DEFAULT), /*  95 */
2001         EMULATE(s10_sigsuspend, 1 | RV_DEFAULT), /*  96 */
2002         NOSYS,                                  /*  97 */
2003         EMULATE(s10_sigaction, 3 | RV_DEFAULT), /*  98 */
2004         EMULATE(s10_sigpending, 2 | RV_DEFAULT), /*  99 */
2005         NOSYS,                                  /* 100 */
2006         NOSYS,                                  /* 101 */
2007         NOSYS,                                  /* 102 */
2008         NOSYS,                                  /* 103 */
2009         NOSYS,                                  /* 104 */
2010         NOSYS,                                  /* 105 */
2011         NOSYS,                                  /* 106 */
2012         EMULATE(s10_waitid, 4 | RV_DEFAULT),    /* 107 */
2013         EMULATE(s10_sigsendsys, 2 | RV_DEFAULT), /* 108 */
2014         NOSYS,                                  /* 109 */
2015         NOSYS,                                  /* 110 */
2016         NOSYS,                                  /* 111 */
2017         NOSYS,                                  /* 112 */
2018         NOSYS,                                  /* 113 */
2019         NOSYS,                                  /* 114 */
2020         NOSYS,                                  /* 115 */
2021         NOSYS,                                  /* 116 */
2022         NOSYS,                                  /* 117 */
2023         NOSYS,                                  /* 118 */
2024         NOSYS,                                  /* 119 */
2025         NOSYS,                                  /* 120 */
2026         NOSYS,                                  /* 121 */
2027         NOSYS,                                  /* 122 */
2028 #if defined(__x86)      /* slots 123-126 are emulated only on x86 */
2029         EMULATE(s10_xstat, 3 | RV_DEFAULT),     /* 123 */
2030         EMULATE(s10_lxstat, 3 | RV_DEFAULT),    /* 124 */
2031         EMULATE(s10_fxstat, 3 | RV_DEFAULT),    /* 125 */
2032         EMULATE(s10_xmknod, 4 | RV_DEFAULT),    /* 126 */
2033 #else   /* !__x86 */
2034         NOSYS,                                  /* 123 */
2035         NOSYS,                                  /* 124 */
2036         NOSYS,                                  /* 125 */
2037         NOSYS,                                  /* 126 */
2038 #endif  /* __x86 */
2039         NOSYS,                                  /* 127 */
2040         NOSYS,                                  /* 128 */
2041         NOSYS,                                  /* 129 */
2042         EMULATE(s10_lchown, 3 | RV_DEFAULT),    /* 130 */
2043         NOSYS,                                  /* 131 */
2044         NOSYS,                                  /* 132 */
2045         NOSYS,                                  /* 133 */
2046         EMULATE(s10_rename, 2 | RV_DEFAULT),    /* 134 */
2047         EMULATE(s10_uname, 1 | RV_DEFAULT),     /* 135 */
2048         NOSYS,                                  /* 136 */
2049         EMULATE(s10_sysconfig, 1 | RV_DEFAULT), /* 137 */
2050         NOSYS,                                  /* 138 */
2051         EMULATE(s10_sysinfo, 3 | RV_DEFAULT),   /* 139 */
2052         NOSYS,                                  /* 140 */
2053         NOSYS,                                  /* 141 */
2054         NOSYS,                                  /* 142 */
2055         EMULATE(s10_fork1, 0 | RV_32RVAL2),     /* 143 */
2056         EMULATE(s10_sigtimedwait, 3 | RV_DEFAULT), /* 144 */
2057         NOSYS,                                  /* 145 */
2058         NOSYS,                                  /* 146 */
2059         EMULATE(s10_lwp_sema_wait, 1 | RV_DEFAULT), /* 147 */
2060         NOSYS,                                  /* 148 */
2061         NOSYS,                                  /* 149 */
2062         NOSYS,                                  /* 150 */
2063         NOSYS,                                  /* 151 */
2064         NOSYS,                                  /* 152 */
2065         NOSYS,                                  /* 153 */
2066         EMULATE(s10_utimes, 2 | RV_DEFAULT),    /* 154 */
2067         NOSYS,                                  /* 155 */
2068         NOSYS,                                  /* 156 */
2069         NOSYS,                                  /* 157 */
2070         NOSYS,                                  /* 158 */
2071         EMULATE(s10_lwp_create, 3 | RV_DEFAULT), /* 159 */
2072         NOSYS,                                  /* 160 */
2073         NOSYS,                                  /* 161 */
2074         NOSYS,                                  /* 162 */
2075         EMULATE(s10_lwp_kill, 2 | RV_DEFAULT),  /* 163 */
2076         NOSYS,                                  /* 164 */
2077         EMULATE(s10_lwp_sigmask, 3 | RV_32RVAL2), /* 165 */
2078 #if defined(__x86)      /* s10_lwp_private() is defined only on x86 */
2079         EMULATE(s10_lwp_private, 3 | RV_DEFAULT), /* 166 */
2080 #else   /* !__x86 */
2081         NOSYS,                                  /* 166 */
2082 #endif  /* __x86 */
2083         NOSYS,                                  /* 167 */
2084         NOSYS,                                  /* 168 */
2085         EMULATE(s10_lwp_mutex_lock, 1 | RV_DEFAULT), /* 169 */
2086         NOSYS,                                  /* 170 */
2087         NOSYS,                                  /* 171 */
2088         NOSYS,                                  /* 172 */
2089         NOSYS,                                  /* 173 */
2090         EMULATE(s10_pwrite, 4 | RV_DEFAULT),    /* 174 */
2091         NOSYS,                                  /* 175 */
2092         NOSYS,                                  /* 176 */
2093         NOSYS,                                  /* 177 */
2094         NOSYS,                                  /* 178 */
2095         NOSYS,                                  /* 179 */
2096         NOSYS,                                  /* 180 */
2097         NOSYS,                                  /* 181 */
2098         NOSYS,                                  /* 182 */
2099         NOSYS,                                  /* 183 */
2100         NOSYS,                                  /* 184 */
2101         EMULATE(s10_acl, 4 | RV_DEFAULT),       /* 185 */
2102         EMULATE(s10_auditsys, 4 | RV_64RVAL),   /* 186 */
2103         NOSYS,                                  /* 187 */
2104         NOSYS,                                  /* 188 */
2105         NOSYS,                                  /* 189 */
2106         EMULATE(s10_sigqueue, 4 | RV_DEFAULT),  /* 190 */
2107         NOSYS,                                  /* 191 */
2108         NOSYS,                                  /* 192 */
2109         NOSYS,                                  /* 193 */
2110         NOSYS,                                  /* 194 */
2111         NOSYS,                                  /* 195 */
2112         NOSYS,                                  /* 196 */
2113         NOSYS,                                  /* 197 */
2114         NOSYS,                                  /* 198 */
2115         NOSYS,                                  /* 199 */
2116         EMULATE(s10_facl, 4 | RV_DEFAULT),      /* 200 */
2117         NOSYS,                                  /* 201 */
2118         NOSYS,                                  /* 202 */
2119         NOSYS,                                  /* 203 */
2120         NOSYS,                                  /* 204 */
2121         EMULATE(s10_signotify, 3 | RV_DEFAULT), /* 205 */
2122         NOSYS,                                  /* 206 */
2123         NOSYS,                                  /* 207 */
2124         NOSYS,                                  /* 208 */
2125         NOSYS,                                  /* 209 */
2126         EMULATE(s10_lwp_mutex_timedlock, 2 | RV_DEFAULT), /* 210 */
2127         NOSYS,                                  /* 211 */
2128         NOSYS,                                  /* 212 */
2129 #if defined(_LP64)      /* the *64 entry points are emulated only when !_LP64 */
2130         NOSYS,                                  /* 213 */
2131 #else   /* !_LP64 */
2132         EMULATE(s10_getdents64, 3 | RV_DEFAULT), /* 213 */
2133 #endif  /* _LP64 */
2134         NOSYS,                                  /* 214 */
2135 #if defined(_LP64)
2136         NOSYS,                                  /* 215 */
2137         NOSYS,                                  /* 216 */
2138         NOSYS,                                  /* 217 */
2139 #else   /* !_LP64 */
2140         EMULATE(s10_stat64, 2 | RV_DEFAULT),    /* 215 */
2141         EMULATE(s10_lstat64, 2 | RV_DEFAULT),   /* 216 */
2142         EMULATE(s10_fstat64, 2 | RV_DEFAULT),   /* 217 */
2143 #endif  /* _LP64 */
2144         NOSYS,                                  /* 218 */
2145         NOSYS,                                  /* 219 */
2146         NOSYS,                                  /* 220 */
2147         NOSYS,                                  /* 221 */
2148         NOSYS,                                  /* 222 */
2149 #if defined(_LP64)
2150         NOSYS,                                  /* 223 */
2151         NOSYS,                                  /* 224 */
2152         NOSYS,                                  /* 225 */
2153 #else   /* !_LP64 */
2154         EMULATE(s10_pwrite64, 5 | RV_DEFAULT),  /* 223 */
2155         EMULATE(s10_creat64, 2 | RV_DEFAULT),   /* 224 */
2156         EMULATE(s10_open64, 3 | RV_DEFAULT),    /* 225 */
2157 #endif  /* _LP64 */
2158         NOSYS,                                  /* 226 */
2159         EMULATE(s10_zone, 5 | RV_DEFAULT),      /* 227 */
2160         NOSYS,                                  /* 228 */
2161         NOSYS,                                  /* 229 */
2162         EMULATE(s10_so_socket, 5 | RV_DEFAULT), /* 230 */
2163         NOSYS,                                  /* 231 */
2164         NOSYS,                                  /* 232 */
2165         NOSYS,                                  /* 233 */
2166         EMULATE(s10_accept, 4 | RV_DEFAULT),    /* 234 */
2167         NOSYS,                                  /* 235 */
2168         NOSYS,                                  /* 236 */
2169         NOSYS,                                  /* 237 */
2170         NOSYS,                                  /* 238 */
2171         NOSYS,                                  /* 239 */
2172         NOSYS,                                  /* 240 */
2173         NOSYS,                                  /* 241 */
2174         NOSYS,                                  /* 242 */
2175         NOSYS,                                  /* 243 */
2176         NOSYS,                                  /* 244 */
2177         NOSYS,                                  /* 245 */
2178         NOSYS,                                  /* 246 */
2179         NOSYS,                                  /* 247 */
2180         NOSYS,                                  /* 248 */
2181         NOSYS,                                  /* 249 */
2182         NOSYS,                                  /* 250 */
2183         EMULATE(s10_lwp_mutex_trylock, 1 | RV_DEFAULT), /* 251 */
2184         NOSYS,                                  /* 252 */
2185         NOSYS,                                  /* 253 */
2186         NOSYS,                                  /* 254 */
2187         NOSYS                                   /* 255 */
2188 };