/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013, Joyent Inc. All rights reserved.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
 *   not yet completed. Not possible to enter the zone, but attributes can
 *   be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
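 *
 *   For example, a thread that must not proceed until a zone has booted
 *   could wait along these lines (illustrative sketch; the zone is held
 *   so it cannot go away while we wait):
 *
 *      zone_hold(zone);
 *      zone_status_wait(zone, ZONE_IS_RUNNING);
 *      zone_rele(zone);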
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  ZSD can also be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
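 *
 *   As an illustrative sketch, a hypothetical subsystem "foo" might
 *   register per-zone state and fetch it later as follows (all foo_*
 *   names are invented for illustration):
 *
 *      zone_key_create(&foo_zone_key, foo_zone_init,
 *          foo_zone_shutdown, foo_zone_fini);
 *      ...
 *      foop = zone_getspecific(foo_zone_key, curproc->p_zone);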
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
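 *
 *   A typical lookup therefore pairs the find with a release, e.g.
 *   (illustrative sketch):
 *
 *      if ((zone = zone_find_by_id(zoneid)) != NULL) {
 *              ... use zone; the hold keeps it from going away ...
 *              zone_rele(zone);
 *      }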
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just max_lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time the
 * mount gets added to the list of mounted zones, it ends up on zoneA's
 * mount list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&mount_lock);
        while (mounts_in_progress > 0) {
                if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers.
         */
        mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&mount_lock);
        return (retval);
}
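
/*
 * Illustrative sketch (not compiled) of how block_mounts() is expected to
 * be paired with resume_mounts() by a caller such as zone_create() or
 * zone_shutdown(); since block_mounts() waits with cv_wait_sig(), a zero
 * return means the wait was interrupted by a signal and the caller should
 * back out:
 *
 *      if (block_mounts() == 0)
 *              return (set_errno(EINTR));
 *      ... examine or update zone and mount state ...
 *      resume_mounts();
 */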

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
        mutex_enter(&mount_lock);
        if (++mounts_in_progress == 0)
                cv_broadcast(&mount_cv);
        mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
        mutex_enter(&mount_lock);
        while (mounts_in_progress < 0)
                cv_wait(&mount_cv, &mount_lock);
        mounts_in_progress++;
        mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
        mutex_enter(&mount_lock);
        if (--mounts_in_progress == 0)
                cv_broadcast(&mount_cv);
        mutex_exit(&mount_lock);
}
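
/*
 * Illustrative sketch (not compiled) of the VFS side of this handshake;
 * the mount path is assumed to bracket its call to VFS_MOUNT() roughly as
 * follows, so that block_mounts() callers never observe a half-done mount:
 *
 *      mount_in_progress();
 *      error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *      mount_completed();
 */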

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock. And zsd_flags are used to ensure that the operations
 * completed so that when zone_key_create (and zone_create) is done, as well as
 * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 * are completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        list_for_each(l, zsd) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        list_for_each(l, zsd) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        list_for_each(&zone_active, zone) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * zone_zsd_configure() already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now, make it
         * globally visible. Specifically zone_getspecific() will
         * always successfully return the zone specific data associated
         * with the key.
         */
        *keyp = key;
}
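
/*
 * A minimal (hypothetical) zone_key_create() consumer, sketching the
 * callback signatures expected by the registration above; foo_zone_t and
 * the foo_* names are invented for illustration:
 *
 *      static zone_key_t foo_zone_key;
 *
 *      static void *
 *      foo_zone_init(zoneid_t zoneid)
 *      {
 *              // Must return non-NULL; see the ASSERT in zsd_apply_create()
 *              return (kmem_zalloc(sizeof (foo_zone_t), KM_SLEEP));
 *      }
 *
 *      static void
 *      foo_zone_fini(zoneid_t zoneid, void *data)
 *      {
 *              kmem_free(data, sizeof (foo_zone_t));
 *      }
 *
 *      zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 */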

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        list_for_each(&zone_active, zone) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone is
                         * going away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callbacks for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        list_for_each(&zone_active, zone) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}
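
/*
 * A module that registered a key typically deletes it when unloading; a
 * hypothetical _fini() sketch (modlinkage and foo_zone_key are assumed to
 * be defined by the module):
 *
 *      int
 *      _fini(void)
 *      {
 *              int error;
 *
 *              if ((error = mod_remove(&modlinkage)) == 0)
 *                      (void) zone_key_delete(foo_zone_key);
 *              return (error);
 *      }
 */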

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}
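
/*
 * As noted in the ZSD comment above, no atomicity is provided across a
 * zone_getspecific()/zone_setspecific() pair.  A consumer needing
 * "initialize exactly once" semantics must supply its own lock; an
 * illustrative sketch (foo_lock, foo_zone_key and foo_alloc() are
 * hypothetical):
 *
 *      mutex_enter(&foo_lock);
 *      if ((data = zone_getspecific(foo_zone_key, zone)) == NULL) {
 *              data = foo_alloc();
 *              (void) zone_setspecific(foo_zone_key, zone, data);
 *      }
 *      mutex_exit(&foo_lock);
 */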

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        list_for_each(&zsd_registered_keys, zsdp) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd. The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        list_for_each(&zone->zone_zsd, t) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callbacks for this zone */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        list_for_each_safe(&zone->zone_zsd, t, next) {
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone is
                 * going away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone is
                 * going away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone is
                 * going away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
                t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
                DTRACE_PROBE2(zsd__destroy__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_destroy != NULL);
                data = t->zsd_data;
                DTRACE_PROBE2(zsd__destroy__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_destroy)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__destroy__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = NULL;
                t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
                t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__destroy__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_CREATE_NEEDED) {
                DTRACE_PROBE2(zsd__wait__for__creator,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
                DTRACE_PROBE2(zsd__wait__for__inprogress,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
        zone_dataset_t *t, *next;

        list_for_each_safe(&zone->zone_datasets, t, next) {
                list_remove(&zone->zone_datasets, t);
                kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);

        e->rcep_p.zone->zone_shares = nv;
        return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
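
/*
 * This ops vector is wired up during boot.  A sketch of the registration,
 * modeled on how zone_init() is assumed to call rctl_register() (flag set
 * abbreviated for illustration):
 *
 *      rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 *          RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 *          FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
 */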

/*
 * zone.cpu-cap resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (cpucaps_zone_get(p->p_zone));
}

/*ARGSUSED*/
static int
zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        zone_t *zone = e->rcep_p.zone;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);

        if (zone == NULL)
                return (0);

        /*
         * set cap to the new value.
         */
        return (cpucaps_zone_set(zone, nv));
}

static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
        rctl_qty_t nlwps;
        zone_t *zone = p->p_zone;

        ASSERT(MUTEX_HELD(&p->p_lock));

        mutex_enter(&zone->zone_nlwps_lock);
        nlwps = zone->zone_nlwps;
        mutex_exit(&zone->zone_nlwps_lock);

        return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
        rctl_qty_t nlwps;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
        nlwps = e->rcep_p.zone->zone_nlwps;

        if (nlwps + incr > rcntl->rcv_value)
                return (1);

        return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        e->rcep_p.zone->zone_nlwps_ctl = nv;
        return (0);
}

static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};
1429 
1430 /*ARGSUSED*/
1431 static rctl_qty_t
1432 zone_procs_usage(rctl_t *r, proc_t *p)
1433 {
1434         rctl_qty_t nprocs;
1435         zone_t *zone = p->p_zone;
1436 
1437         ASSERT(MUTEX_HELD(&p->p_lock));
1438 
1439         mutex_enter(&zone->zone_nlwps_lock);
1440         nprocs = zone->zone_nprocs;
1441         mutex_exit(&zone->zone_nlwps_lock);
1442 
1443         return (nprocs);
1444 }
1445 
1446 /*ARGSUSED*/
1447 static int
1448 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1449     rctl_qty_t incr, uint_t flags)
1450 {
1451         rctl_qty_t nprocs;
1452 
1453         ASSERT(MUTEX_HELD(&p->p_lock));
1454         ASSERT(e->rcep_t == RCENTITY_ZONE);
1455         if (e->rcep_p.zone == NULL)
1456                 return (0);
1457         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1458         nprocs = e->rcep_p.zone->zone_nprocs;
1459 
1460         if (nprocs + incr > rcntl->rcv_value)
1461                 return (1);
1462 
1463         return (0);
1464 }
1465 
1466 /*ARGSUSED*/
1467 static int
1468 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1469 {
1470         ASSERT(MUTEX_HELD(&p->p_lock));
1471         ASSERT(e->rcep_t == RCENTITY_ZONE);
1472         if (e->rcep_p.zone == NULL)
1473                 return (0);
1474         e->rcep_p.zone->zone_nprocs_ctl = nv;
1475         return (0);
1476 }
1477 
1478 static rctl_ops_t zone_procs_ops = {
1479         rcop_no_action,
1480         zone_procs_usage,
1481         zone_procs_set,
1482         zone_procs_test,
1483 };
1484 
1485 /*ARGSUSED*/
1486 static int
1487 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1488     rctl_qty_t incr, uint_t flags)
1489 {
1490         rctl_qty_t v;
1491         ASSERT(MUTEX_HELD(&p->p_lock));
1492         ASSERT(e->rcep_t == RCENTITY_ZONE);
1493         v = e->rcep_p.zone->zone_shmmax + incr;
1494         if (v > rval->rcv_value)
1495                 return (1);
1496         return (0);
1497 }
1498 
1499 static rctl_ops_t zone_shmmax_ops = {
1500         rcop_no_action,
1501         rcop_no_usage,
1502         rcop_no_set,
1503         zone_shmmax_test
1504 };
1505 
1506 /*ARGSUSED*/
1507 static int
1508 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1509     rctl_qty_t incr, uint_t flags)
1510 {
1511         rctl_qty_t v;
1512         ASSERT(MUTEX_HELD(&p->p_lock));
1513         ASSERT(e->rcep_t == RCENTITY_ZONE);
1514         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1515         if (v > rval->rcv_value)
1516                 return (1);
1517         return (0);
1518 }
1519 
1520 static rctl_ops_t zone_shmmni_ops = {
1521         rcop_no_action,
1522         rcop_no_usage,
1523         rcop_no_set,
1524         zone_shmmni_test
1525 };
1526 
1527 /*ARGSUSED*/
1528 static int
1529 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t v;
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1536         if (v > rval->rcv_value)
1537                 return (1);
1538         return (0);
1539 }
1540 
1541 static rctl_ops_t zone_semmni_ops = {
1542         rcop_no_action,
1543         rcop_no_usage,
1544         rcop_no_set,
1545         zone_semmni_test
1546 };
1547 
1548 /*ARGSUSED*/
1549 static int
1550 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1551     rctl_qty_t incr, uint_t flags)
1552 {
1553         rctl_qty_t v;
1554         ASSERT(MUTEX_HELD(&p->p_lock));
1555         ASSERT(e->rcep_t == RCENTITY_ZONE);
1556         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1557         if (v > rval->rcv_value)
1558                 return (1);
1559         return (0);
1560 }
1561 
1562 static rctl_ops_t zone_msgmni_ops = {
1563         rcop_no_action,
1564         rcop_no_usage,
1565         rcop_no_set,
1566         zone_msgmni_test
1567 };
1568 
1569 /*ARGSUSED*/
1570 static rctl_qty_t
1571 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1572 {
1573         rctl_qty_t q;
1574         ASSERT(MUTEX_HELD(&p->p_lock));
1575         mutex_enter(&p->p_zone->zone_mem_lock);
1576         q = p->p_zone->zone_locked_mem;
1577         mutex_exit(&p->p_zone->zone_mem_lock);
1578         return (q);
1579 }
1580 
1581 /*ARGSUSED*/
1582 static int
1583 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1584     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1585 {
1586         rctl_qty_t q;
1587         zone_t *z;
1588 
1589         z = e->rcep_p.zone;
1590         ASSERT(MUTEX_HELD(&p->p_lock));
1591         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1592         q = z->zone_locked_mem;
1593         if (q + incr > rcntl->rcv_value)
1594                 return (1);
1595         return (0);
1596 }
1597 
1598 /*ARGSUSED*/
1599 static int
1600 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1601     rctl_qty_t nv)
1602 {
1603         ASSERT(MUTEX_HELD(&p->p_lock));
1604         ASSERT(e->rcep_t == RCENTITY_ZONE);
1605         if (e->rcep_p.zone == NULL)
1606                 return (0);
1607         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1608         return (0);
1609 }
1610 
1611 static rctl_ops_t zone_locked_mem_ops = {
1612         rcop_no_action,
1613         zone_locked_mem_usage,
1614         zone_locked_mem_set,
1615         zone_locked_mem_test
1616 };
1617 
1618 /*ARGSUSED*/
1619 static rctl_qty_t
1620 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1621 {
1622         rctl_qty_t q;
1623         zone_t *z = p->p_zone;
1624 
1625         ASSERT(MUTEX_HELD(&p->p_lock));
1626         mutex_enter(&z->zone_mem_lock);
1627         q = z->zone_max_swap;
1628         mutex_exit(&z->zone_mem_lock);
1629         return (q);
1630 }
1631 
1632 /*ARGSUSED*/
1633 static int
1634 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1635     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1636 {
1637         rctl_qty_t q;
1638         zone_t *z;
1639 
1640         z = e->rcep_p.zone;
1641         ASSERT(MUTEX_HELD(&p->p_lock));
1642         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1643         q = z->zone_max_swap;
1644         if (q + incr > rcntl->rcv_value)
1645                 return (1);
1646         return (0);
1647 }
1648 
1649 /*ARGSUSED*/
1650 static int
1651 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1652     rctl_qty_t nv)
1653 {
1654         ASSERT(MUTEX_HELD(&p->p_lock));
1655         ASSERT(e->rcep_t == RCENTITY_ZONE);
1656         if (e->rcep_p.zone == NULL)
1657                 return (0);
1658         e->rcep_p.zone->zone_max_swap_ctl = nv;
1659         return (0);
1660 }
1661 
1662 static rctl_ops_t zone_max_swap_ops = {
1663         rcop_no_action,
1664         zone_max_swap_usage,
1665         zone_max_swap_set,
1666         zone_max_swap_test
1667 };
1668 
1669 /*ARGSUSED*/
1670 static rctl_qty_t
1671 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1672 {
1673         rctl_qty_t q;
1674         zone_t *z = p->p_zone;
1675 
1676         ASSERT(MUTEX_HELD(&p->p_lock));
1677         mutex_enter(&z->zone_rctl_lock);
1678         q = z->zone_max_lofi;
1679         mutex_exit(&z->zone_rctl_lock);
1680         return (q);
1681 }
1682 
1683 /*ARGSUSED*/
1684 static int
1685 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1686     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1687 {
1688         rctl_qty_t q;
1689         zone_t *z;
1690 
1691         z = e->rcep_p.zone;
1692         ASSERT(MUTEX_HELD(&p->p_lock));
1693         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1694         q = z->zone_max_lofi;
1695         if (q + incr > rcntl->rcv_value)
1696                 return (1);
1697         return (0);
1698 }
1699 
1700 /*ARGSUSED*/
1701 static int
1702 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1703     rctl_qty_t nv)
1704 {
1705         ASSERT(MUTEX_HELD(&p->p_lock));
1706         ASSERT(e->rcep_t == RCENTITY_ZONE);
1707         if (e->rcep_p.zone == NULL)
1708                 return (0);
1709         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1710         return (0);
1711 }
1712 
1713 static rctl_ops_t zone_max_lofi_ops = {
1714         rcop_no_action,
1715         zone_max_lofi_usage,
1716         zone_max_lofi_set,
1717         zone_max_lofi_test
1718 };
1719 
1720 /*
1721  * Helper function to stamp the zone with a unique, never-reused ID.
1722  */
1723 static void
1724 zone_uniqid(zone_t *zone)
1725 {
1726         static uint64_t uniqid = 0;
1727 
1728         ASSERT(MUTEX_HELD(&zonehash_lock));
1729         zone->zone_uniqid = uniqid++;
1730 }
1731 
1732 /*
1733  * Returns a held pointer to the "kcred" for the specified zone.
1734  */
1735 struct cred *
1736 zone_get_kcred(zoneid_t zoneid)
1737 {
1738         zone_t *zone;
1739         cred_t *cr;
1740 
1741         if ((zone = zone_find_by_id(zoneid)) == NULL)
1742                 return (NULL);
1743         cr = zone->zone_kcred;
1744         crhold(cr);
1745         zone_rele(zone);
1746         return (cr);
1747 }
1748 
1749 static int
1750 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1751 {
1752         zone_t *zone = ksp->ks_private;
1753         zone_kstat_t *zk = ksp->ks_data;
1754 
1755         if (rw == KSTAT_WRITE)
1756                 return (EACCES);
1757 
1758         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1759         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1760         return (0);
1761 }
1762 
1763 static int
1764 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1765 {
1766         zone_t *zone = ksp->ks_private;
1767         zone_kstat_t *zk = ksp->ks_data;
1768 
1769         if (rw == KSTAT_WRITE)
1770                 return (EACCES);
1771 
1772         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1773         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1774         return (0);
1775 }
1776 
1777 static int
1778 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1779 {
1780         zone_t *zone = ksp->ks_private;
1781         zone_kstat_t *zk = ksp->ks_data;
1782 
1783         if (rw == KSTAT_WRITE)
1784                 return (EACCES);
1785 
1786         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1787         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1788         return (0);
1789 }
1790 
1791 static kstat_t *
1792 zone_kstat_create_common(zone_t *zone, char *name,
1793     int (*updatefunc) (kstat_t *, int))
1794 {
1795         kstat_t *ksp;
1796         zone_kstat_t *zk;
1797 
1798         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1799             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1800             KSTAT_FLAG_VIRTUAL);
1801 
1802         if (ksp == NULL)
1803                 return (NULL);
1804 
1805         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
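             /*
              * KSTAT_DATA_STRING payloads live outside ks_data proper, so
              * the zone name must be counted in ks_data_size for kstat
              * snapshots to reserve room for it.
              */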
1806         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1807         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1808         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1809         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1810         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1811         ksp->ks_update = updatefunc;
1812         ksp->ks_private = zone;
1813         kstat_install(ksp);
1814         return (ksp);
1815 }
1816 
1817 static int
1818 zone_misc_kstat_update(kstat_t *ksp, int rw)
1819 {
1820         zone_t *zone = ksp->ks_private;
1821         zone_misc_kstat_t *zmp = ksp->ks_data;
1822         hrtime_t tmp;
1823 
1824         if (rw == KSTAT_WRITE)
1825                 return (EACCES);
1826 
1827         tmp = zone->zone_utime;
1828         scalehrtime(&tmp);
1829         zmp->zm_utime.value.ui64 = tmp;
1830         tmp = zone->zone_stime;
1831         scalehrtime(&tmp);
1832         zmp->zm_stime.value.ui64 = tmp;
1833         tmp = zone->zone_wtime;
1834         scalehrtime(&tmp);
1835         zmp->zm_wtime.value.ui64 = tmp;
1836 
1837         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1838         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1839         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1840 
1841         return (0);
1842 }
1843 
1844 static kstat_t *
1845 zone_misc_kstat_create(zone_t *zone)
1846 {
1847         kstat_t *ksp;
1848         zone_misc_kstat_t *zmp;
1849 
1850         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1851             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1852             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1853             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1854                 return (NULL);
1855 
1856         if (zone->zone_id != GLOBAL_ZONEID)
1857                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1858 
1859         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1860         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1861         ksp->ks_lock = &zone->zone_misc_lock;
1862         zone->zone_misc_stats = zmp;
1863 
1864         /* The kstat "name" field is not large enough for a full zonename */
1865         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1866         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1867         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1868         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1869         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1870         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1871         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1872         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1873             KSTAT_DATA_UINT32);
1874 
1875         ksp->ks_update = zone_misc_kstat_update;
1876         ksp->ks_private = zone;
1877 
1878         kstat_install(ksp);
1879         return (ksp);
1880 }
1881 
1882 static void
1883 zone_kstat_create(zone_t *zone)
1884 {
1885         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1886             "lockedmem", zone_lockedmem_kstat_update);
1887         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1888             "swapresv", zone_swapresv_kstat_update);
1889         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1890             "nprocs", zone_nprocs_kstat_update);
1891 
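             /*
              * If the misc kstat cannot be created, fall back to a plain
              * allocation so that zone_misc_stats is always safe to update.
              */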
1892         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1893                 zone->zone_misc_stats = kmem_zalloc(
1894                     sizeof (zone_misc_kstat_t), KM_SLEEP);
1895         }
1896 }
1897 
1898 static void
1899 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1900 {
1901         void *data;
1902 
1903         if (*pkstat != NULL) {
1904                 data = (*pkstat)->ks_data;
1905                 kstat_delete(*pkstat);
1906                 kmem_free(data, datasz);
1907                 *pkstat = NULL;
1908         }
1909 }
1910 
1911 static void
1912 zone_kstat_delete(zone_t *zone)
1913 {
1914         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1915             sizeof (zone_kstat_t));
1916         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
1917             sizeof (zone_kstat_t));
1918         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
1919             sizeof (zone_kstat_t));
1920         zone_kstat_delete_common(&zone->zone_misc_ksp,
1921             sizeof (zone_misc_kstat_t));
1922 }
1923 
1924 /*
1925  * Called very early on in boot to initialize the ZSD list so that
1926  * zone_key_create() can be called before zone_init().  It also initializes
1927  * portions of zone0 which may be used before zone_init() is called.  The
1928  * variable "global_zone" will be set when zone0 is fully initialized by
1929  * zone_init().
1930  */
1931 void
1932 zone_zsd_init(void)
1933 {
1934         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1935         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1936         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1937             offsetof(struct zsd_entry, zsd_linkage));
1938         list_create(&zone_active, sizeof (zone_t),
1939             offsetof(zone_t, zone_linkage));
1940         list_create(&zone_deathrow, sizeof (zone_t),
1941             offsetof(zone_t, zone_linkage));
1942 
1943         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1944         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1945         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1946         zone0.zone_shares = 1;
1947         zone0.zone_nlwps = 0;
1948         zone0.zone_nlwps_ctl = INT_MAX;
1949         zone0.zone_nprocs = 0;
1950         zone0.zone_nprocs_ctl = INT_MAX;
1951         zone0.zone_locked_mem = 0;
1952         zone0.zone_locked_mem_ctl = UINT64_MAX;
1953         ASSERT(zone0.zone_max_swap == 0);
1954         zone0.zone_max_swap_ctl = UINT64_MAX;
1955         zone0.zone_max_lofi = 0;
1956         zone0.zone_max_lofi_ctl = UINT64_MAX;
1957         zone0.zone_shmmax = 0;
1958         zone0.zone_ipc.ipcq_shmmni = 0;
1959         zone0.zone_ipc.ipcq_semmni = 0;
1960         zone0.zone_ipc.ipcq_msgmni = 0;
1961         zone0.zone_name = GLOBAL_ZONENAME;
1962         zone0.zone_nodename = utsname.nodename;
1963         zone0.zone_domain = srpc_domain;
1964         zone0.zone_hostid = HW_INVALID_HOSTID;
1965         zone0.zone_fs_allowed = NULL;
1966         zone0.zone_ref = 1;
1967         zone0.zone_id = GLOBAL_ZONEID;
1968         zone0.zone_status = ZONE_IS_RUNNING;
1969         zone0.zone_rootpath = "/";
1970         zone0.zone_rootpathlen = 2;
1971         zone0.zone_psetid = ZONE_PS_INVAL;
1972         zone0.zone_ncpus = 0;
1973         zone0.zone_ncpus_online = 0;
1974         zone0.zone_proc_initpid = 1;
1975         zone0.zone_initname = initname;
1976         zone0.zone_lockedmem_kstat = NULL;
1977         zone0.zone_swapresv_kstat = NULL;
1978         zone0.zone_nprocs_kstat = NULL;
1979 
1980         zone0.zone_stime = 0;
1981         zone0.zone_utime = 0;
1982         zone0.zone_wtime = 0;
1983 
1984         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
1985             offsetof(zone_ref_t, zref_linkage));
1986         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
1987             offsetof(struct zsd_entry, zsd_linkage));
1988         list_insert_head(&zone_active, &zone0);
1989 
1990         /*
1991          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
1992          * to anything meaningful.  It is assigned to be 'rootdir' in
1993          * vfs_mountroot().
1994          */
1995         zone0.zone_rootvp = NULL;
1996         zone0.zone_vfslist = NULL;
1997         zone0.zone_bootargs = initargs;
1998         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
1999         /*
2000          * The global zone has all privileges
2001          */
2002         priv_fillset(zone0.zone_privset);
2003         /*
2004          * Add p0 to the global zone
2005          */
2006         zone0.zone_zsched = &p0;
2007         p0.p_zone = &zone0;
2008 }
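
     /*
      * Illustrative ZSD usage (a sketch; the "foo" names are hypothetical):
      * a subsystem wanting per-zone state registers a key, possibly before
      * zone_init() has run:
      *
      *         static zone_key_t foo_zone_key;
      *
      *         zone_key_create(&foo_zone_key, foo_zone_init,
      *             foo_zone_shutdown, foo_zone_fini);
      *
      * and later retrieves what its constructor returned with
      * zone_getspecific(foo_zone_key, zone).
      */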
2009 
2010 /*
2011  * Compute a hash value based on the contents of the label and the DOI.  The
2012  * hash algorithm is somewhat arbitrary, but is based on the observation that
2013  * humans will likely pick labels that differ by amounts that work out to be
2014  * multiples of the number of hash chains, and thus stirring in some primes
2015  * should help.
2016  */
2017 static uint_t
2018 hash_bylabel(void *hdata, mod_hash_key_t key)
2019 {
2020         const ts_label_t *lab = (ts_label_t *)key;
2021         const uint32_t *up, *ue;
2022         uint_t hash;
2023         int i;
2024 
2025         _NOTE(ARGUNUSED(hdata));
2026 
2027         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2028         /* we depend on alignment of label, but not representation */
2029         up = (const uint32_t *)&lab->tsl_label;
2030         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2031         i = 1;
2032         while (up < ue) {
2033                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2034                 hash += *up + (*up << ((i % 16) + 1));
2035                 up++;
2036                 i++;
2037         }
2038         return (hash);
2039 }
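
     /*
      * Arithmetically: the seed is 3 * tsl_doi, since x + (x << 1) == 3x,
      * and each 32-bit word w at position i then contributes
      * w * (2^((i mod 16) + 1) + 1) to the hash.
      */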
2040 
2041 /*
2042  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2043  * equal).  This may need to be changed if less than / greater than is ever
2044  * needed.
2045  */
2046 static int
2047 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2048 {
2049         ts_label_t *lab1 = (ts_label_t *)key1;
2050         ts_label_t *lab2 = (ts_label_t *)key2;
2051 
2052         return (label_equal(lab1, lab2) ? 0 : 1);
2053 }
2054 
2055 /*
2056  * Called by main() to initialize the zones framework.
2057  */
2058 void
2059 zone_init(void)
2060 {
2061         rctl_dict_entry_t *rde;
2062         rctl_val_t *dval;
2063         rctl_set_t *set;
2064         rctl_alloc_gp_t *gp;
2065         rctl_entity_p_t e;
2066         int res;
2067 
2068         ASSERT(curproc == &p0);
2069 
2070         /*
2071          * Create ID space for zone IDs.  ID 0 is reserved for the
2072          * global zone.
2073          */
2074         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2075 
2076         /*
2077          * Initialize generic zone resource controls, if any.
2078          */
2079         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2080             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2081             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2082             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2083 
2084         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2085             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2086             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2087             RCTL_GLOBAL_INFINITE,
2088             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2089 
2090         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2091             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2092             INT_MAX, INT_MAX, &zone_lwps_ops);
2093 
2094         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2095             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2096             INT_MAX, INT_MAX, &zone_procs_ops);
2097 
2098         /*
2099          * System V IPC resource controls
2100          */
2101         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2102             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2103             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2104 
2105         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2106             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2107             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2108 
2109         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2110             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2111             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2112 
2113         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2114             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2115             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2116 
2117         /*
2118          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2119          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2120          */
2121         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2122         bzero(dval, sizeof (rctl_val_t));
2123         dval->rcv_value = 1;
2124         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2125         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2126         dval->rcv_action_recip_pid = -1;
2127 
2128         rde = rctl_dict_lookup("zone.cpu-shares");
2129         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2130 
2131         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2132             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2133             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2134             &zone_locked_mem_ops);
2135 
2136         rc_zone_max_swap = rctl_register("zone.max-swap",
2137             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2138             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2139             &zone_max_swap_ops);
2140 
2141         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2142             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2143             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2144             &zone_max_lofi_ops);
2145 
2146         /*
2147          * Initialize the ``global zone''.
2148          */
2149         set = rctl_set_create();
2150         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2151         mutex_enter(&p0.p_lock);
2152         e.rcep_p.zone = &zone0;
2153         e.rcep_t = RCENTITY_ZONE;
2154         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2155             gp);
2156 
2157         zone0.zone_nlwps = p0.p_lwpcnt;
2158         zone0.zone_nprocs = 1;
2159         zone0.zone_ntasks = 1;
2160         mutex_exit(&p0.p_lock);
2161         zone0.zone_restart_init = B_TRUE;
2162         zone0.zone_brand = &native_brand;
2163         rctl_prealloc_destroy(gp);
2164         /*
2165          * pool_default hasn't been initialized yet, so we let pool_init()
2166          * take care of making sure the global zone is in the default pool.
2167          */
2168 
2169         /*
2170          * Initialize global zone kstats
2171          */
2172         zone_kstat_create(&zone0);
2173 
2174         /*
2175          * Initialize zone label.
2176          * MLPs are initialized when tnzonecfg is loaded.
2177          */
2178         zone0.zone_slabel = l_admin_low;
2179         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2180         label_hold(l_admin_low);
2181 
2182         /*
2183          * Initialize the lock for the database structure used by mntfs.
2184          */
2185         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2186 
2187         mutex_enter(&zonehash_lock);
2188         zone_uniqid(&zone0);
2189         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2190 
2191         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2192             mod_hash_null_valdtor);
2193         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2194             zone_hash_size, mod_hash_null_valdtor);
2195         /*
2196          * maintain zonehashbylabel only for labeled systems
2197          */
2198         if (is_system_labeled())
2199                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2200                     zone_hash_size, mod_hash_null_keydtor,
2201                     mod_hash_null_valdtor, hash_bylabel, NULL,
2202                     hash_labelkey_cmp, KM_SLEEP);
2203         zonecount = 1;
2204 
2205         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2206             (mod_hash_val_t)&zone0);
2207         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2208             (mod_hash_val_t)&zone0);
2209         if (is_system_labeled()) {
2210                 zone0.zone_flags |= ZF_HASHED_LABEL;
2211                 (void) mod_hash_insert(zonehashbylabel,
2212                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2213         }
2214         mutex_exit(&zonehash_lock);
2215 
2216         /*
2217          * We avoid setting zone_kcred until now, since kcred is initialized
2218          * sometime after zone_zsd_init() and before zone_init().
2219          */
2220         zone0.zone_kcred = kcred;
2221         /*
2222          * The global zone is fully initialized (except for zone_rootvp which
2223          * will be set when the root filesystem is mounted).
2224          */
2225         global_zone = &zone0;
2226 
2227         /*
2228          * Set up an event channel to send zone status change notifications on.
2229          */
2230         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2231             EVCH_CREAT);
2232 
2233         if (res)
2234                 panic("sysevent_evc_bind failed during zone setup.\n");
2236 }
2237 
2238 static void
2239 zone_free(zone_t *zone)
2240 {
2241         ASSERT(zone != global_zone);
2242         ASSERT(zone->zone_ntasks == 0);
2243         ASSERT(zone->zone_nlwps == 0);
2244         ASSERT(zone->zone_nprocs == 0);
2245         ASSERT(zone->zone_cred_ref == 0);
2246         ASSERT(zone->zone_kcred == NULL);
2247         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2248             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2249         ASSERT(list_is_empty(&zone->zone_ref_list));
2250 
2251         /*
2252          * Remove any zone caps.
2253          */
2254         cpucaps_zone_remove(zone);
2255 
2256         ASSERT(zone->zone_cpucap == NULL);
2257 
2258         /* remove from deathrow list */
2259         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2260                 ASSERT(zone->zone_ref == 0);
2261                 mutex_enter(&zone_deathrow_lock);
2262                 list_remove(&zone_deathrow, zone);
2263                 mutex_exit(&zone_deathrow_lock);
2264         }
2265 
2266         list_destroy(&zone->zone_ref_list);
2267         zone_free_zsd(zone);
2268         zone_free_datasets(zone);
2269         list_destroy(&zone->zone_dl_list);
2270 
2271         if (zone->zone_rootvp != NULL)
2272                 VN_RELE(zone->zone_rootvp);
2273         if (zone->zone_rootpath)
2274                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2275         if (zone->zone_name != NULL)
2276                 kmem_free(zone->zone_name, ZONENAME_MAX);
2277         if (zone->zone_slabel != NULL)
2278                 label_rele(zone->zone_slabel);
2279         if (zone->zone_nodename != NULL)
2280                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2281         if (zone->zone_domain != NULL)
2282                 kmem_free(zone->zone_domain, _SYS_NMLN);
2283         if (zone->zone_privset != NULL)
2284                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2285         if (zone->zone_rctls != NULL)
2286                 rctl_set_free(zone->zone_rctls);
2287         if (zone->zone_bootargs != NULL)
2288                 strfree(zone->zone_bootargs);
2289         if (zone->zone_initname != NULL)
2290                 strfree(zone->zone_initname);
2291         if (zone->zone_fs_allowed != NULL)
2292                 strfree(zone->zone_fs_allowed);
2293         if (zone->zone_pfexecd != NULL)
2294                 klpd_freelist(&zone->zone_pfexecd);
2295         id_free(zoneid_space, zone->zone_id);
2296         mutex_destroy(&zone->zone_lock);
2297         cv_destroy(&zone->zone_cv);
2298         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2299         rw_destroy(&zone->zone_mntfs_db_lock);
2300         kmem_free(zone, sizeof (zone_t));
2301 }
2302 
2303 /*
2304  * See block comment at the top of this file for information about zone
2305  * status values.
2306  */
2307 /*
2308  * Convenience function for setting zone status.
2309  */
2310 static void
2311 zone_status_set(zone_t *zone, zone_status_t status)
2312 {
2314         nvlist_t *nvl = NULL;
2315         ASSERT(MUTEX_HELD(&zone_status_lock));
2316         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2317             status >= zone_status_get(zone));
2318 
2319         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2320             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2321             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2322             zone_status_table[status]) ||
2323             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2324             zone_status_table[zone->zone_status]) ||
2325             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2326             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2327             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2328             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2329 #ifdef DEBUG
2330                 (void) printf(
2331                     "Failed to allocate and send zone state change event.\n");
2332 #endif
2333         }
2334         nvlist_free(nvl);
2335 
2336         zone->zone_status = status;
2337 
2338         cv_broadcast(&zone->zone_cv);
2339 }
2340 
2341 /*
2342  * Public function to retrieve the zone status.  The zone status may
2343  * change after it is retrieved.
2344  */
2345 zone_status_t
2346 zone_status_get(zone_t *zone)
2347 {
2348         return (zone->zone_status);
2349 }
2350 
2351 static int
2352 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2353 {
2354         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2355         int err = 0;
2356 
2357         ASSERT(zone != global_zone);
2358         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2359                 goto done;      /* EFAULT or ENAMETOOLONG */
2360 
2361         if (zone->zone_bootargs != NULL)
2362                 strfree(zone->zone_bootargs);
2363 
2364         zone->zone_bootargs = strdup(buf);
2365 
2366 done:
2367         kmem_free(buf, BOOTARGS_MAX);
2368         return (err);
2369 }
2370 
2371 static int
2372 zone_set_brand(zone_t *zone, const char *brand)
2373 {
2374         struct brand_attr *attrp;
2375         brand_t *bp;
2376 
2377         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2378         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2379                 kmem_free(attrp, sizeof (struct brand_attr));
2380                 return (EFAULT);
2381         }
2382 
2383         bp = brand_register_zone(attrp);
2384         kmem_free(attrp, sizeof (struct brand_attr));
2385         if (bp == NULL)
2386                 return (EINVAL);
2387 
2388         /*
2389          * This is the only place where a zone can change its brand.
2390          * We already need to hold zone_status_lock to check the zone
2391          * status, so we'll just use that lock to serialize zone
2392          * branding requests as well.
2393          */
2394         mutex_enter(&zone_status_lock);
2395 
2396         /* Re-branding is not allowed, and the zone must not yet be booting */
2397         if ((ZONE_IS_BRANDED(zone)) ||
2398             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2399                 mutex_exit(&zone_status_lock);
2400                 brand_unregister_zone(bp);
2401                 return (EINVAL);
2402         }
2403 
2404         /* set up the brand specific data */
2405         zone->zone_brand = bp;
2406         ZBROP(zone)->b_init_brand_data(zone);
2407 
2408         mutex_exit(&zone_status_lock);
2409         return (0);
2410 }
2411 
2412 static int
2413 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2414 {
2415         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2416         int err = 0;
2417 
2418         ASSERT(zone != global_zone);
2419         if ((err = copyinstr(zone_fs_allowed, buf,
2420             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2421                 goto done;
2422 
2423         if (zone->zone_fs_allowed != NULL)
2424                 strfree(zone->zone_fs_allowed);
2425 
2426         zone->zone_fs_allowed = strdup(buf);
2427 
2428 done:
2429         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2430         return (err);
2431 }
2432 
2433 static int
2434 zone_set_initname(zone_t *zone, const char *zone_initname)
2435 {
2436         char initname[INITNAME_SZ];
2437         size_t len;
2438         int err = 0;
2439 
2440         ASSERT(zone != global_zone);
2441         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2442                 return (err);   /* EFAULT or ENAMETOOLONG */
2443 
2444         if (zone->zone_initname != NULL)
2445                 strfree(zone->zone_initname);
2446 
2447         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2448         (void) strcpy(zone->zone_initname, initname);
2449         return (0);
2450 }
2451 
2452 static int
2453 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2454 {
2455         uint64_t mcap;
2456         int err = 0;
2457 
2458         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2459                 zone->zone_phys_mcap = mcap;
2460 
2461         return (err);
2462 }
2463 
2464 static int
2465 zone_set_sched_class(zone_t *zone, const char *new_class)
2466 {
2467         char sched_class[PC_CLNMSZ];
2468         id_t classid;
2469         int err;
2470 
2471         ASSERT(zone != global_zone);
2472         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2473                 return (err);   /* EFAULT or ENAMETOOLONG */
2474 
2475         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2476                 return (set_errno(EINVAL));
2477         zone->zone_defaultcid = classid;
2478         ASSERT(zone->zone_defaultcid > 0 &&
2479             zone->zone_defaultcid < loaded_classes);
2480 
2481         return (0);
2482 }
2483 
2484 /*
2485  * Block indefinitely waiting for (zone_status >= status)
2486  */
2487 void
2488 zone_status_wait(zone_t *zone, zone_status_t status)
2489 {
2490         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2491 
2492         mutex_enter(&zone_status_lock);
2493         while (zone->zone_status < status) {
2494                 cv_wait(&zone->zone_cv, &zone_status_lock);
2495         }
2496         mutex_exit(&zone_status_lock);
2497 }
2498 
2499 /*
2500  * Private CPR-safe version of zone_status_wait().
2501  */
2502 static void
2503 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2504 {
2505         callb_cpr_t cprinfo;
2506 
2507         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2508 
2509         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2510             str);
2511         mutex_enter(&zone_status_lock);
2512         while (zone->zone_status < status) {
2513                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2514                 cv_wait(&zone->zone_cv, &zone_status_lock);
2515                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2516         }
2517         /*
2518          * zone_status_lock is implicitly released by the following.
2519          */
2520         CALLB_CPR_EXIT(&cprinfo);
2521 }
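
     /*
      * The CALLB_CPR_* bracketing above marks this thread as safe for
      * checkpoint/resume (CPR) only while it is blocked in cv_wait().
      */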
2522 
2523 /*
2524  * Block until zone enters requested state or signal is received.  Return (0)
2525  * if signaled, non-zero otherwise.
2526  */
2527 int
2528 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2529 {
2530         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2531 
2532         mutex_enter(&zone_status_lock);
2533         while (zone->zone_status < status) {
2534                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2535                         mutex_exit(&zone_status_lock);
2536                         return (0);
2537                 }
2538         }
2539         mutex_exit(&zone_status_lock);
2540         return (1);
2541 }
2542 
2543 /*
2544  * Block until the zone enters the requested state or the timeout expires,
2545  * whichever happens first.  Return (-1) if operation timed out, time remaining
2546  * otherwise.
2547  */
2548 clock_t
2549 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2550 {
2551         clock_t timeleft = 0;
2552 
2553         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2554 
2555         mutex_enter(&zone_status_lock);
2556         while (zone->zone_status < status && timeleft != -1) {
2557                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2558         }
2559         mutex_exit(&zone_status_lock);
2560         return (timeleft);
2561 }
2562 
2563 /*
2564  * Block until the zone enters the requested state, the current process is
2565  * signaled, or the timeout expires, whichever happens first.  Return (-1) if
2566  * operation timed out, 0 if signaled, time remaining otherwise.
2567  */
2568 clock_t
2569 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2570 {
2571         clock_t timeleft = tim - ddi_get_lbolt();
2572 
2573         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2574 
2575         mutex_enter(&zone_status_lock);
2576         while (zone->zone_status < status) {
2577                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2578                     tim);
2579                 if (timeleft <= 0)
2580                         break;
2581         }
2582         mutex_exit(&zone_status_lock);
2583         return (timeleft);
2584 }
2585 
2586 /*
2587  * Zones have two reference counts: one for references from credential
2588  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2589  * This is so we can allow a zone to be rebooted while there are still
2590  * outstanding cred references, since certain drivers cache dblks (which
2591  * implicitly results in cached creds).  We wait for zone_ref to drop to
2592  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2593  * later freed when the zone_cred_ref drops to 0, though nothing other
2594  * than the zone id and privilege set should be accessed once the zone
2595  * is "dead".
2596  *
2597  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2598  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2599  * to 0.  This can be useful to flush out other sources of cached creds
2600  * that may be less innocuous than the driver case.
2601  *
2602  * Zones also provide a tracked reference counting mechanism in which zone
2603  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2604  * debuggers determine the sources of leaked zone references.  See
2605  * zone_hold_ref() and zone_rele_ref() below for more information.
2606  */
2607 
2608 int zone_wait_for_cred = 0;
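
     /*
      * Sketch of a long-lived tracked hold (the subsystem constant below is
      * illustrative):
      *
      *         zone_ref_t ref;
      *
      *         zone_init_ref(&ref);
      *         zone_hold_ref(z, &ref, ZONE_REF_NFS);
      *         ... long-running use of z ...
      *         zone_rele_ref(&ref, ZONE_REF_NFS);
      */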
2609 
2610 static void
2611 zone_hold_locked(zone_t *z)
2612 {
2613         ASSERT(MUTEX_HELD(&z->zone_lock));
2614         z->zone_ref++;
2615         ASSERT(z->zone_ref != 0);
2616 }
2617 
2618 /*
2619  * Increment the specified zone's reference count.  The zone's zone_t structure
2620  * will not be freed as long as the zone's reference count is nonzero.
2621  * Decrement the zone's reference count via zone_rele().
2622  *
2623  * NOTE: This function should only be used to hold zones for short periods of
2624  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2625  */
2626 void
2627 zone_hold(zone_t *z)
2628 {
2629         mutex_enter(&z->zone_lock);
2630         zone_hold_locked(z);
2631         mutex_exit(&z->zone_lock);
2632 }
2633 
2634 /*
2635  * If the non-cred ref count drops to 1 and either the cred ref count
2636  * is 0 or we aren't waiting for cred references, the zone is ready to
2637  * be destroyed.
2638  */
2639 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2640             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2641 
2642 /*
2643  * Common zone reference release function invoked by zone_rele() and
2644  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2645  * zone's subsystem-specific reference counters are not affected by the
2646  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2647  * removed from the specified zone's reference list.  ref must be non-NULL iff
2648  * subsys is not ZONE_REF_NUM_SUBSYS.
2649  */
2650 static void
2651 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2652 {
2653         boolean_t wakeup;
2654 
2655         mutex_enter(&z->zone_lock);
2656         ASSERT(z->zone_ref != 0);
2657         z->zone_ref--;
2658         if (subsys != ZONE_REF_NUM_SUBSYS) {
2659                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2660                 z->zone_subsys_ref[subsys]--;
2661                 list_remove(&z->zone_ref_list, ref);
2662         }
2663         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2664                 /* no more refs, free the structure */
2665                 mutex_exit(&z->zone_lock);
2666                 zone_free(z);
2667                 return;
2668         }
2669         /* signal zone_destroy so the zone can finish halting */
2670         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2671         mutex_exit(&z->zone_lock);
2672 
2673         if (wakeup) {
2674                 /*
2675                  * Grabbing zonehash_lock here effectively synchronizes with
2676                  * zone_destroy() to avoid missed signals.
2677                  */
2678                 mutex_enter(&zonehash_lock);
2679                 cv_broadcast(&zone_destroy_cv);
2680                 mutex_exit(&zonehash_lock);
2681         }
2682 }
2683 
2684 /*
2685  * Decrement the specified zone's reference count.  The specified zone will
2686  * cease to exist after this function returns if the reference count drops to
2687  * zero.  This function should be paired with zone_hold().
2688  */
2689 void
2690 zone_rele(zone_t *z)
2691 {
2692         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2693 }
2694 
2695 /*
2696  * Initialize a zone reference structure.  This function must be invoked for
2697  * a reference structure before the structure is passed to zone_hold_ref().
2698  */
2699 void
2700 zone_init_ref(zone_ref_t *ref)
2701 {
2702         ref->zref_zone = NULL;
2703         list_link_init(&ref->zref_linkage);
2704 }
2705 
2706 /*
2707  * Acquire a reference to zone z.  The caller must specify the
2708  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2709  * zone_ref_t structure will represent a reference to the specified zone.  Use
2710  * zone_rele_ref() to release the reference.
2711  *
2712  * The referenced zone_t structure will not be freed as long as the zone_t's
2713  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2714  * references.
2715  *
2716  * NOTE: The zone_ref_t structure must be initialized before it is used.
2717  * See zone_init_ref() above.
2718  */
2719 void
2720 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2721 {
2722         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2723 
2724         /*
2725          * Prevent consumers from reusing a reference structure before
2726          * releasing it.
2727          */
2728         VERIFY(ref->zref_zone == NULL);
2729 
2730         ref->zref_zone = z;
2731         mutex_enter(&z->zone_lock);
2732         zone_hold_locked(z);
2733         z->zone_subsys_ref[subsys]++;
2734         ASSERT(z->zone_subsys_ref[subsys] != 0);
2735         list_insert_head(&z->zone_ref_list, ref);
2736         mutex_exit(&z->zone_lock);
2737 }
2738 
2739 /*
2740  * Release the zone reference represented by the specified zone_ref_t.
2741  * The reference is invalid after it's released; however, the zone_ref_t
2742  * structure can be reused without having to invoke zone_init_ref().
2743  * subsys should be the same value that was passed to zone_hold_ref()
2744  * when the reference was acquired.
2745  */
2746 void
2747 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2748 {
2749         zone_rele_common(ref->zref_zone, ref, subsys);
2750 
2751         /*
2752          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2753          * when consumers dereference the reference.  This helps us catch
2754          * consumers who use released references.  Furthermore, this lets
2755          * consumers reuse the zone_ref_t structure without having to
2756          * invoke zone_init_ref().
2757          */
2758         ref->zref_zone = NULL;
2759 }
2760 
2761 void
2762 zone_cred_hold(zone_t *z)
2763 {
2764         mutex_enter(&z->zone_lock);
2765         z->zone_cred_ref++;
2766         ASSERT(z->zone_cred_ref != 0);
2767         mutex_exit(&z->zone_lock);
2768 }
2769 
2770 void
2771 zone_cred_rele(zone_t *z)
2772 {
2773         boolean_t wakeup;
2774 
2775         mutex_enter(&z->zone_lock);
2776         ASSERT(z->zone_cred_ref != 0);
2777         z->zone_cred_ref--;
2778         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2779                 /* no more refs, free the structure */
2780                 mutex_exit(&z->zone_lock);
2781                 zone_free(z);
2782                 return;
2783         }
2784         /*
2785          * If zone_destroy is waiting for the cred references to drain
2786          * out, and they have, signal it.
2787          */
2788         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2789             zone_status_get(z) >= ZONE_IS_DEAD);
2790         mutex_exit(&z->zone_lock);
2791 
2792         if (wakeup) {
2793                 /*
2794                  * Grabbing zonehash_lock here effectively synchronizes with
2795                  * zone_destroy() to avoid missed signals.
2796                  */
2797                 mutex_enter(&zonehash_lock);
2798                 cv_broadcast(&zone_destroy_cv);
2799                 mutex_exit(&zonehash_lock);
2800         }
2801 }
2802 
2803 void
2804 zone_task_hold(zone_t *z)
2805 {
2806         mutex_enter(&z->zone_lock);
2807         z->zone_ntasks++;
2808         ASSERT(z->zone_ntasks != 0);
2809         mutex_exit(&z->zone_lock);
2810 }
2811 
2812 void
2813 zone_task_rele(zone_t *zone)
2814 {
2815         uint_t refcnt;
2816 
2817         mutex_enter(&zone->zone_lock);
2818         ASSERT(zone->zone_ntasks != 0);
2819         refcnt = --zone->zone_ntasks;
2820         if (refcnt > 1) {       /* Common case */
2821                 mutex_exit(&zone->zone_lock);
2822                 return;
2823         }
2824         zone_hold_locked(zone); /* so we can use the zone_t later */
2825         mutex_exit(&zone->zone_lock);
2826         if (refcnt == 1) {
2827                 /*
2828                  * See if the zone is shutting down.
2829                  */
2830                 mutex_enter(&zone_status_lock);
2831                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2832                         goto out;
2833                 }
2834 
2835                 /*
2836                  * Make sure the ntasks didn't change since we
2837                  * dropped zone_lock.
2838                  */
2839                 mutex_enter(&zone->zone_lock);
2840                 if (refcnt != zone->zone_ntasks) {
2841                         mutex_exit(&zone->zone_lock);
2842                         goto out;
2843                 }
2844                 mutex_exit(&zone->zone_lock);
2845 
2846                 /*
2847                  * No more user processes in the zone.  The zone is empty.
2848                  */
2849                 zone_status_set(zone, ZONE_IS_EMPTY);
2850                 goto out;
2851         }
2852 
2853         ASSERT(refcnt == 0);
2854         /*
2855          * zsched has exited; the zone is dead.
2856          */
2857         zone->zone_zsched = NULL;            /* paranoia */
2858         mutex_enter(&zone_status_lock);
2859         zone_status_set(zone, ZONE_IS_DEAD);
2860 out:
2861         mutex_exit(&zone_status_lock);
2862         zone_rele(zone);
2863 }
2864 
2865 zoneid_t
2866 getzoneid(void)
2867 {
2868         return (curproc->p_zone->zone_id);
2869 }
2870 
2871 /*
2872  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2873  * check the validity of a zone's state.
2874  */
2875 static zone_t *
2876 zone_find_all_by_id(zoneid_t zoneid)
2877 {
2878         mod_hash_val_t hv;
2879         zone_t *zone = NULL;
2880 
2881         ASSERT(MUTEX_HELD(&zonehash_lock));
2882 
2883         if (mod_hash_find(zonehashbyid,
2884             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2885                 zone = (zone_t *)hv;
2886         return (zone);
2887 }
2888 
2889 static zone_t *
2890 zone_find_all_by_label(const ts_label_t *label)
2891 {
2892         mod_hash_val_t hv;
2893         zone_t *zone = NULL;
2894 
2895         ASSERT(MUTEX_HELD(&zonehash_lock));
2896 
2897         /*
2898          * zonehashbylabel is not maintained for unlabeled systems
2899          */
2900         if (!is_system_labeled())
2901                 return (NULL);
2902         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2903                 zone = (zone_t *)hv;
2904         return (zone);
2905 }
2906 
2907 static zone_t *
2908 zone_find_all_by_name(char *name)
2909 {
2910         mod_hash_val_t hv;
2911         zone_t *zone = NULL;
2912 
2913         ASSERT(MUTEX_HELD(&zonehash_lock));
2914 
2915         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2916                 zone = (zone_t *)hv;
2917         return (zone);
2918 }
2919 
2920 /*
2921  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2922  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2923  * Caller must call zone_rele() once it is done with the zone.
2924  *
2925  * The zone may begin the zone_destroy() sequence immediately after this
2926  * function returns, but may be safely used until zone_rele() is called.
2927  */
2928 zone_t *
2929 zone_find_by_id(zoneid_t zoneid)
2930 {
2931         zone_t *zone;
2932         zone_status_t status;
2933 
2934         mutex_enter(&zonehash_lock);
2935         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2936                 mutex_exit(&zonehash_lock);
2937                 return (NULL);
2938         }
2939         status = zone_status_get(zone);
2940         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2941                 /*
2942                  * For all practical purposes the zone doesn't exist.
2943                  */
2944                 mutex_exit(&zonehash_lock);
2945                 return (NULL);
2946         }
2947         zone_hold(zone);
2948         mutex_exit(&zonehash_lock);
2949         return (zone);
2950 }
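
     /*
      * Typical caller pattern (a sketch; the error chosen is up to the
      * caller):
      *
      *         zone_t *zone;
      *
      *         if ((zone = zone_find_by_id(zoneid)) == NULL)
      *                 return (ESRCH);
      *         ... use zone ...
      *         zone_rele(zone);
      */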
2951 
2952 /*
2953  * Similar to zone_find_by_id, but using zone label as the key.
2954  */
2955 zone_t *
2956 zone_find_by_label(const ts_label_t *label)
2957 {
2958         zone_t *zone;
2959         zone_status_t status;
2960 
2961         mutex_enter(&zonehash_lock);
2962         if ((zone = zone_find_all_by_label(label)) == NULL) {
2963                 mutex_exit(&zonehash_lock);
2964                 return (NULL);
2965         }
2966 
2967         status = zone_status_get(zone);
2968         if (status > ZONE_IS_DOWN) {
2969                 /*
2970                  * For all practical purposes the zone doesn't exist.
2971                  */
2972                 mutex_exit(&zonehash_lock);
2973                 return (NULL);
2974         }
2975         zone_hold(zone);
2976         mutex_exit(&zonehash_lock);
2977         return (zone);
2978 }
2979 
2980 /*
2981  * Similar to zone_find_by_id, but using zone name as the key.
2982  */
2983 zone_t *
2984 zone_find_by_name(char *name)
2985 {
2986         zone_t *zone;
2987         zone_status_t status;
2988 
2989         mutex_enter(&zonehash_lock);
2990         if ((zone = zone_find_all_by_name(name)) == NULL) {
2991                 mutex_exit(&zonehash_lock);
2992                 return (NULL);
2993         }
2994         status = zone_status_get(zone);
2995         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2996                 /*
2997                  * For all practical purposes the zone doesn't exist.
2998                  */
2999                 mutex_exit(&zonehash_lock);
3000                 return (NULL);
3001         }
3002         zone_hold(zone);
3003         mutex_exit(&zonehash_lock);
3004         return (zone);
3005 }
3006 
3007 /*
3008  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3009  * if there is a zone "foo" rooted at /foo/root, and the path argument
3010  * is "/foo/root/proc", it will return the held zone_t corresponding to
3011  * zone "foo".
3012  *
3013  * zone_find_by_path() always returns a non-NULL value, since at the
3014  * very least every path will be contained in the global zone.
3015  *
3016  * As with the other zone_find_by_*() functions, the caller is
3017  * responsible for zone_rele()ing the return value of this function.
3018  */
3019 zone_t *
3020 zone_find_by_path(const char *path)
3021 {
3022         zone_t *zone;
3023         zone_t *zret = NULL;
3024         zone_status_t status;
3025 
3026         if (path == NULL) {
3027                 /*
3028                  * Call from rootconf().
3029                  */
3030                 zone_hold(global_zone);
3031                 return (global_zone);
3032         }
3033         ASSERT(*path == '/');
3034         mutex_enter(&zonehash_lock);
3035         list_for_each(&zone_active, zone) {
3036                 if (ZONE_PATH_VISIBLE(path, zone))
3037                         zret = zone;
3038         }
3039         ASSERT(zret != NULL);
3040         status = zone_status_get(zret);
3041         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3042                 /*
3043                  * For all practical purposes the zone doesn't exist.
3044                  */
3045                 zret = global_zone;
3046         }
3047         zone_hold(zret);
3048         mutex_exit(&zonehash_lock);
3049         return (zret);
3050 }
3051 
3052 /*
3053  * Public interface for updating per-zone load averages.  Called once per
3054  * second.
3055  *
3056  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3057  */
3058 void
3059 zone_loadavg_update()
3060 {
3061         zone_t *zp;
3062         zone_status_t status;
3063         struct loadavg_s *lavg;
3064         hrtime_t zone_total;
3065         int i;
3066         hrtime_t hr_avg;
3067         int nrun;
3068         static int64_t f[3] = { 135, 27, 9 };
3069         int64_t q, r;
3070 
3071         mutex_enter(&zonehash_lock);
3072         for (zp = list_head(&zone_active); zp != NULL;
3073             zp = list_next(&zone_active, zp)) {
3074                 mutex_enter(&zp->zone_lock);
3075 
3076                 /* Skip zones that are on the way down or not yet up */
3077                 status = zone_status_get(zp);
3078                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3079                         /* For all practical purposes the zone doesn't exist. */
3080                         mutex_exit(&zp->zone_lock);
3081                         continue;
3082                 }
3083 
3084                 /*
3085                  * Update the 10 second moving average data in zone_loadavg.
3086                  */
3087                 lavg = &zp->zone_loadavg;
3088 
3089                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3090                 scalehrtime(&zone_total);
3091 
3092                 /* The zone_total should always be increasing. */
3093                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3094                     zone_total - lavg->lg_total : 0;
3095                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3096                 /* lg_total holds the prev. 1 sec. total */
3097                 lavg->lg_total = zone_total;
3098 
3099                 /*
3100                  * To simplify the calculation, we don't calculate the load avg.
3101                  * until the zone has been up for at least 10 seconds and our
3102                  * moving average is thus full.
3103                  */
3104                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3105                         lavg->lg_len++;
3106                         mutex_exit(&zp->zone_lock);
3107                         continue;
3108                 }
3109 
3110                 /* Now calculate the 1, 5 and 15 minute load averages. */
3111                 hr_avg = 0;
3112                 for (i = 0; i < S_LOADAVG_SZ; i++)
3113                         hr_avg += lavg->lg_loads[i];
3114                 hr_avg = hr_avg / S_LOADAVG_SZ;
3115                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3116 
3117                 /* Compute load avg. See comment in calcloadavg() */
3118                 for (i = 0; i < 3; i++) {
3119                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3120                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3121                         zp->zone_hp_avenrun[i] +=
3122                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3123 
3124                         /* avenrun[] can only hold 31 bits of load avg. */
3125                         if (zp->zone_hp_avenrun[i] <
3126                             ((uint64_t)1<<(31+16-FSHIFT)))
3127                                 zp->zone_avenrun[i] = (int32_t)
3128                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3129                         else
3130                                 zp->zone_avenrun[i] = 0x7fffffff;
3131                 }
3132 
3133                 mutex_exit(&zp->zone_lock);
3134         }
3135         mutex_exit(&zonehash_lock);
3136 }
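
/*
 * A note on the fixed-point math above (a sketch, not normative): each
 * one-second update applies the exponential moving average
 *
 *	avenrun = avenrun + (load - avenrun) * k,	k = f[i] / 2^13
 *
 * where f[i] / 2^13 approximates 1 - exp(-1sec/T) for decay times T of
 * 60, 300 and 900 seconds; e.g. (1 - exp(-1/60)) * 8192 ~= 135.4, which
 * matches f[0] == 135.  The q/r split merely applies the multiply to
 * the integer and fractional halves of the fixed-point value separately
 * to avoid overflow.
 */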
3137 
3138 /*
3139  * Get the number of cpus visible to this zone.  The system-wide global
3140  * 'ncpus' is returned if pools are disabled, the caller is in the
3141  * global zone, or a NULL zone argument is passed in.
3142  */
3143 int
3144 zone_ncpus_get(zone_t *zone)
3145 {
3146         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3147 
3148         return (myncpus != 0 ? myncpus : ncpus);
3149 }
3150 
3151 /*
3152  * Get the number of online cpus visible to this zone.  The system-wide
3153  * global 'ncpus_online' is returned if pools are disabled, the caller
3154  * is in the global zone, or a NULL zone argument is passed in.
3155  */
3156 int
3157 zone_ncpus_online_get(zone_t *zone)
3158 {
3159         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3160 
3161         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3162 }
3163 
3164 /*
3165  * Return the pool to which the zone is currently bound.
3166  */
3167 pool_t *
3168 zone_pool_get(zone_t *zone)
3169 {
3170         ASSERT(pool_lock_held());
3171 
3172         return (zone->zone_pool);
3173 }
3174 
3175 /*
3176  * Set the zone's pool pointer and update the zone's visibility to match
3177  * the resources in the new pool.
3178  */
3179 void
3180 zone_pool_set(zone_t *zone, pool_t *pool)
3181 {
3182         ASSERT(pool_lock_held());
3183         ASSERT(MUTEX_HELD(&cpu_lock));
3184 
3185         zone->zone_pool = pool;
3186         zone_pset_set(zone, pool->pool_pset->pset_id);
3187 }
3188 
3189 /*
3190  * Return the cached value of the id of the processor set to which the
3191  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3192  * facility is disabled.
3193  */
3194 psetid_t
3195 zone_pset_get(zone_t *zone)
3196 {
3197         ASSERT(MUTEX_HELD(&cpu_lock));
3198 
3199         return (zone->zone_psetid);
3200 }
3201 
3202 /*
3203  * Set the cached value of the id of the processor set to which the zone
3204  * is currently bound.  Also update the zone's visibility to match the
3205  * resources in the new processor set.
3206  */
3207 void
3208 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3209 {
3210         psetid_t oldpsetid;
3211 
3212         ASSERT(MUTEX_HELD(&cpu_lock));
3213         oldpsetid = zone_pset_get(zone);
3214 
3215         if (oldpsetid == newpsetid)
3216                 return;
3217         /*
3218          * Global zone sees all.
3219          */
3220         if (zone != global_zone) {
3221                 zone->zone_psetid = newpsetid;
3222                 if (newpsetid != ZONE_PS_INVAL)
3223                         pool_pset_visibility_add(newpsetid, zone);
3224                 if (oldpsetid != ZONE_PS_INVAL)
3225                         pool_pset_visibility_remove(oldpsetid, zone);
3226         }
3227         /*
3228          * Disabling pools, so we should start using the global values
3229          * for ncpus and ncpus_online.
3230          */
3231         if (newpsetid == ZONE_PS_INVAL) {
3232                 zone->zone_ncpus = 0;
3233                 zone->zone_ncpus_online = 0;
3234         }
3235 }
3236 
3237 /*
3238  * Walk the list of active zones and issue the provided callback for
3239  * each of them.
3240  *
3241  * Caller must not be holding any locks that may be acquired under
3242  * zonehash_lock.  See comment at the beginning of the file for a list of
3243  * common locks and their interactions with zones.
3244  */
3245 int
3246 zone_walk(int (*cb)(zone_t *, void *), void *data)
3247 {
3248         zone_t *zone;
3249         int ret = 0;
3250         zone_status_t status;
3251 
3252         mutex_enter(&zonehash_lock);
3253         list_for_each(&zone_active, zone) {
3254                 /*
3255                  * Skip zones that shouldn't be externally visible.
3256                  */
3257                 status = zone_status_get(zone);
3258                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3259                         continue;
3260                 /*
3261                  * Bail immediately if any callback invocation returns a
3262                  * non-zero value.
3263                  */
3264                 ret = (*cb)(zone, data);
3265                 if (ret != 0)
3266                         break;
3267         }
3268         mutex_exit(&zonehash_lock);
3269         return (ret);
3270 }
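
/*
 * A minimal zone_walk() usage sketch with a hypothetical callback that
 * counts the visible zones; returning 0 from every invocation lets the
 * walk run to completion:
 *
 *	static int
 *	count_zones_cb(zone_t *zp, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nzones = 0;
 *	(void) zone_walk(count_zones_cb, &nzones);
 */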
3271 
3272 static int
3273 zone_set_root(zone_t *zone, const char *upath)
3274 {
3275         vnode_t *vp;
3276         int trycount;
3277         int error = 0;
3278         char *path;
3279         struct pathname upn, pn;
3280         size_t pathlen;
3281 
3282         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3283                 return (error);
3284 
3285         pn_alloc(&pn);
3286 
3287         /* prevent infinite loop */
3288         trycount = 10;
3289         for (;;) {
3290                 if (--trycount <= 0) {
3291                         error = ESTALE;
3292                         goto out;
3293                 }
3294 
3295                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3296                         /*
3297                          * VOP_ACCESS() may cover 'vp' with a new
3298                          * filesystem, if 'vp' is an autoFS vnode.
3299                          * Get the new 'vp' if so.
3300                          */
3301                         if ((error =
3302                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3303                             (!vn_ismntpt(vp) ||
3304                             (error = traverse(&vp)) == 0)) {
3305                                 pathlen = pn.pn_pathlen + 2;
3306                                 path = kmem_alloc(pathlen, KM_SLEEP);
3307                                 (void) strncpy(path, pn.pn_path,
3308                                     pn.pn_pathlen + 1);
3309                                 path[pathlen - 2] = '/';
3310                                 path[pathlen - 1] = '\0';
3311                                 pn_free(&pn);
3312                                 pn_free(&upn);
3313 
3314                                 /* Success! */
3315                                 break;
3316                         }
3317                         VN_RELE(vp);
3318                 }
3319                 if (error != ESTALE)
3320                         goto out;
3321         }
3322 
3323         ASSERT(error == 0);
3324         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3325         zone->zone_rootpath = path;
3326         zone->zone_rootpathlen = pathlen;
3327         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3328                 zone->zone_flags |= ZF_IS_SCRATCH;
3329         return (0);
3330 
3331 out:
3332         pn_free(&pn);
3333         pn_free(&upn);
3334         return (error);
3335 }
3336 
3337 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3338                         ((c) >= 'a' && (c) <= 'z') || \
3339                         ((c) >= 'A' && (c) <= 'Z'))
3340 
3341 static int
3342 zone_set_name(zone_t *zone, const char *uname)
3343 {
3344         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3345         size_t len;
3346         int i, err;
3347 
3348         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3349                 kmem_free(kname, ZONENAME_MAX);
3350                 return (err);   /* EFAULT or ENAMETOOLONG */
3351         }
3352 
3353         /* must be less than ZONENAME_MAX */
3354         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3355                 kmem_free(kname, ZONENAME_MAX);
3356                 return (EINVAL);
3357         }
3358 
3359         /*
3360          * Name must start with an alphanumeric and must contain only
3361          * alphanumerics, '-', '_' and '.'.
3362          */
3363         if (!isalnum(kname[0])) {
3364                 kmem_free(kname, ZONENAME_MAX);
3365                 return (EINVAL);
3366         }
3367         for (i = 1; i < len - 1; i++) {
3368                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3369                     kname[i] != '.') {
3370                         kmem_free(kname, ZONENAME_MAX);
3371                         return (EINVAL);
3372                 }
3373         }
3374 
3375         zone->zone_name = kname;
3376         return (0);
3377 }
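
/*
 * For illustration, under the rules above "web01", "zone-2" and
 * "build.test_1" are accepted, while "-foo" (leading non-alphanumeric),
 * "my zone" (embedded space) and the empty string are rejected with
 * EINVAL.
 */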
3378 
3379 /*
3380  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3381  * is NULL or it points to a zone with no hostid emulation, then the machine's
3382  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3383  * zero if neither the zone nor the host machine (global zone) has a hostid.
3384  * It returns HW_INVALID_HOSTID if the function attempts to return the
3385  * machine's hostid and that hostid cannot be parsed from hw_serial.
3386  */
3387 uint32_t
3388 zone_get_hostid(zone_t *zonep)
3389 {
3390         unsigned long machine_hostid;
3391 
3392         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3393                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3394                         return (HW_INVALID_HOSTID);
3395                 return ((uint32_t)machine_hostid);
3396         }
3397         return (zonep->zone_hostid);
3398 }
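
/*
 * For example, a zone configured with hostid emulation (say, a
 * hypothetical zone_hostid of 0x00beef00) gets that value back
 * directly; otherwise the machine's hostid is parsed from the decimal
 * string held in hw_serial.
 */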
3399 
3400 /*
3401  * Similar to thread_create(), but makes sure the thread is in the appropriate
3402  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3403  */
3404 /*ARGSUSED*/
3405 kthread_t *
3406 zthread_create(
3407     caddr_t stk,
3408     size_t stksize,
3409     void (*proc)(),
3410     void *arg,
3411     size_t len,
3412     pri_t pri)
3413 {
3414         kthread_t *t;
3415         zone_t *zone = curproc->p_zone;
3416         proc_t *pp = zone->zone_zsched;
3417 
3418         zone_hold(zone);        /* Reference to be dropped when thread exits */
3419 
3420         /*
3421          * No one should be trying to create threads if the zone is shutting
3422          * down and there aren't any kernel threads around.  See comment
3423          * in zthread_exit().
3424          */
3425         ASSERT(!(zone->zone_kthreads == NULL &&
3426             zone_status_get(zone) >= ZONE_IS_EMPTY));
3427         /*
3428          * Create a thread, but don't let it run until we've finished setting
3429          * things up.
3430          */
3431         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3432         ASSERT(t->t_forw == NULL);
3433         mutex_enter(&zone_status_lock);
3434         if (zone->zone_kthreads == NULL) {
3435                 t->t_forw = t->t_back = t;
3436         } else {
3437                 kthread_t *tx = zone->zone_kthreads;
3438 
3439                 t->t_forw = tx;
3440                 t->t_back = tx->t_back;
3441                 tx->t_back->t_forw = t;
3442                 tx->t_back = t;
3443         }
3444         zone->zone_kthreads = t;
3445         mutex_exit(&zone_status_lock);
3446 
3447         mutex_enter(&pp->p_lock);
3448         t->t_proc_flag |= TP_ZTHREAD;
3449         project_rele(t->t_proj);
3450         t->t_proj = project_hold(pp->p_task->tk_proj);
3451 
3452         /*
3453          * Setup complete, let it run.
3454          */
3455         thread_lock(t);
3456         t->t_schedflag |= TS_ALLSTART;
3457         setrun_locked(t);
3458         thread_unlock(t);
3459 
3460         mutex_exit(&pp->p_lock);
3461 
3462         return (t);
3463 }
3464 
3465 /*
3466  * Similar to thread_exit().  Must be called by threads created via
3467  * zthread_create().
3468  */
3469 void
3470 zthread_exit(void)
3471 {
3472         kthread_t *t = curthread;
3473         proc_t *pp = curproc;
3474         zone_t *zone = pp->p_zone;
3475 
3476         mutex_enter(&zone_status_lock);
3477 
3478         /*
3479          * Reparent to p0
3480          */
3481         kpreempt_disable();
3482         mutex_enter(&pp->p_lock);
3483         t->t_proc_flag &= ~TP_ZTHREAD;
3484         t->t_procp = &p0;
3485         hat_thread_exit(t);
3486         mutex_exit(&pp->p_lock);
3487         kpreempt_enable();
3488 
3489         if (t->t_back == t) {
3490                 ASSERT(t->t_forw == t);
3491                 /*
3492                  * If the zone is empty, once the thread count
3493                  * goes to zero no further kernel threads can be
3494                  * created.  This is because if the creator is a process
3495                  * in the zone, then it must have exited before the zone
3496                  * state could be set to ZONE_IS_EMPTY.
3497                  * Otherwise, if the creator is a kernel thread in the
3498                  * zone, the thread count is non-zero.
3499                  *
3500                  * This really means that non-zone kernel threads should
3501                  * not create zone kernel threads.
3502                  */
3503                 zone->zone_kthreads = NULL;
3504                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3505                         zone_status_set(zone, ZONE_IS_DOWN);
3506                         /*
3507                          * Remove any CPU caps on this zone.
3508                          */
3509                         cpucaps_zone_remove(zone);
3510                 }
3511         } else {
3512                 t->t_forw->t_back = t->t_back;
3513                 t->t_back->t_forw = t->t_forw;
3514                 if (zone->zone_kthreads == t)
3515                         zone->zone_kthreads = t->t_forw;
3516         }
3517         mutex_exit(&zone_status_lock);
3518         zone_rele(zone);
3519         thread_exit();
3520         /* NOTREACHED */
3521 }
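
/*
 * A minimal pairing sketch (hypothetical zone thread): a thread created
 * with zthread_create() must terminate via zthread_exit(), rather than
 * returning, so the hold taken on its zone is released and zone
 * shutdown can make progress.
 *
 *	static void
 *	my_zone_worker(void *arg)
 *	{
 *		(do per-zone work here)
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_zone_worker, arg, 0,
 *	    minclsyspri);
 */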
3522 
3523 static void
3524 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3525 {
3526         vnode_t *oldvp;
3527 
3528         /* we're going to hold a reference here to the directory */
3529         VN_HOLD(vp);
3530 
3531         /* update the absolute cwd/root path; see c2/audit.c */
3532         if (AU_AUDITING())
3533                 audit_chdirec(vp, vpp);
3534 
3535         mutex_enter(&pp->p_lock);
3536         oldvp = *vpp;
3537         *vpp = vp;
3538         mutex_exit(&pp->p_lock);
3539         if (oldvp != NULL)
3540                 VN_RELE(oldvp);
3541 }
3542 
3543 /*
3544  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3545  */
3546 static int
3547 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3548 {
3549         nvpair_t *nvp = NULL;
3550         boolean_t priv_set = B_FALSE;
3551         boolean_t limit_set = B_FALSE;
3552         boolean_t action_set = B_FALSE;
3553 
3554         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3555                 const char *name;
3556                 uint64_t ui64;
3557 
3558                 name = nvpair_name(nvp);
3559                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3560                         return (EINVAL);
3561                 (void) nvpair_value_uint64(nvp, &ui64);
3562                 if (strcmp(name, "privilege") == 0) {
3563                         /*
3564                          * Currently only privileged values are allowed, but
3565                          * this may change in the future.
3566                          */
3567                         if (ui64 != RCPRIV_PRIVILEGED)
3568                                 return (EINVAL);
3569                         rv->rcv_privilege = ui64;
3570                         priv_set = B_TRUE;
3571                 } else if (strcmp(name, "limit") == 0) {
3572                         rv->rcv_value = ui64;
3573                         limit_set = B_TRUE;
3574                 } else if (strcmp(name, "action") == 0) {
3575                         if (ui64 != RCTL_LOCAL_NOACTION &&
3576                             ui64 != RCTL_LOCAL_DENY)
3577                                 return (EINVAL);
3578                         rv->rcv_flagaction = ui64;
3579                         action_set = B_TRUE;
3580                 } else {
3581                         return (EINVAL);
3582                 }
3583         }
3584 
3585         if (!(priv_set && limit_set && action_set))
3586                 return (EINVAL);
3587         rv->rcv_action_signal = 0;
3588         rv->rcv_action_recipient = NULL;
3589         rv->rcv_action_recip_pid = -1;
3590         rv->rcv_firing_time = 0;
3591 
3592         return (0);
3593 }
3594 
3595 /*
3596  * Non-global zone version of start_init.
3597  */
3598 void
3599 zone_start_init(void)
3600 {
3601         proc_t *p = ttoproc(curthread);
3602         zone_t *z = p->p_zone;
3603 
3604         ASSERT(!INGLOBALZONE(curproc));
3605 
3606         /*
3607          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3608          * storing just the pid of init is sufficient.
3609          */
3610         z->zone_proc_initpid = p->p_pid;
3611 
3612         /*
3613          * We maintain zone_boot_err so that we can return the cause of the
3614          * failure back to the caller of the zone_boot syscall.
3615          */
3616         p->p_zone->zone_boot_err = start_init_common();
3617 
3618         /*
3619          * Prevent a booting zone from transitioning to the running state
3620          * if the global zone has begun shutting down.
3621          */
3622         mutex_enter(&zone_status_lock);
3623         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3624             ZONE_IS_SHUTTING_DOWN) {
3625                 /*
3626                  * Make sure we are still in the booting state-- we could have
3627                  * raced and already be shutting down, or even further along.
3628                  */
3629                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3630                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3631                 }
3632                 mutex_exit(&zone_status_lock);
3633                 /* It's gone bad, dispose of the process */
3634                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3635                         mutex_enter(&p->p_lock);
3636                         ASSERT(p->p_flag & SEXITLWPS);
3637                         lwp_exit();
3638                 }
3639         } else {
3640                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3641                         zone_status_set(z, ZONE_IS_RUNNING);
3642                 mutex_exit(&zone_status_lock);
3643                 /* cause the process to return to userland. */
3644                 lwp_rtt();
3645         }
3646 }
3647 
3648 struct zsched_arg {
3649         zone_t *zone;
3650         nvlist_t *nvlist;
3651 };
3652 
3653 /*
3654  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3655  * anything to do with scheduling, but rather with the fact that
3656  * per-zone kernel threads are parented to zsched, just like regular
3657  * kernel threads are parented to sched (p0).
3658  *
3659  * zsched is also responsible for launching init for the zone.
3660  */
3661 static void
3662 zsched(void *arg)
3663 {
3664         struct zsched_arg *za = arg;
3665         proc_t *pp = curproc;
3666         proc_t *initp = proc_init;
3667         zone_t *zone = za->zone;
3668         cred_t *cr, *oldcred;
3669         rctl_set_t *set;
3670         rctl_alloc_gp_t *gp;
3671         contract_t *ct = NULL;
3672         task_t *tk, *oldtk;
3673         rctl_entity_p_t e;
3674         kproject_t *pj;
3675 
3676         nvlist_t *nvl = za->nvlist;
3677         nvpair_t *nvp = NULL;
3678 
3679         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3680         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3681         PTOU(pp)->u_argc = 0;
3682         PTOU(pp)->u_argv = NULL;
3683         PTOU(pp)->u_envp = NULL;
3684         closeall(P_FINFO(pp));
3685 
3686         /*
3687          * We are this zone's "zsched" process.  As the zone isn't generally
3688          * visible yet, we don't need to grab any locks before initializing
3689          * its zone_zsched pointer.
3690          */
3691         zone_hold(zone);  /* this hold is released by zone_destroy() */
3692         zone->zone_zsched = pp;
3693         mutex_enter(&pp->p_lock);
3694         pp->p_zone = zone;
3695         mutex_exit(&pp->p_lock);
3696 
3697         /*
3698          * Disassociate process from its 'parent'; parent ourselves to init
3699          * (pid 1) and change other values as needed.
3700          */
3701         sess_create();
3702 
3703         mutex_enter(&pidlock);
3704         proc_detach(pp);
3705         pp->p_ppid = 1;
3706         pp->p_flag |= SZONETOP;
3707         pp->p_ancpid = 1;
3708         pp->p_parent = initp;
3709         pp->p_psibling = NULL;
3710         if (initp->p_child)
3711                 initp->p_child->p_psibling = pp;
3712         pp->p_sibling = initp->p_child;
3713         initp->p_child = pp;
3714 
3715         /* Decrement what newproc() incremented. */
3716         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3717         /*
3718          * Our credentials are about to become kcred-like, so we don't care
3719          * about the caller's ruid.
3720          */
3721         upcount_inc(crgetruid(kcred), zone->zone_id);
3722         mutex_exit(&pidlock);
3723 
3724         /*
3725          * getting out of global zone, so decrement lwp and process counts
3726          */
3727         pj = pp->p_task->tk_proj;
3728         mutex_enter(&global_zone->zone_nlwps_lock);
3729         pj->kpj_nlwps -= pp->p_lwpcnt;
3730         global_zone->zone_nlwps -= pp->p_lwpcnt;
3731         pj->kpj_nprocs--;
3732         global_zone->zone_nprocs--;
3733         mutex_exit(&global_zone->zone_nlwps_lock);
3734 
3735         /*
3736          * Decrement locked memory counts on old zone and project.
3737          */
3738         mutex_enter(&global_zone->zone_mem_lock);
3739         global_zone->zone_locked_mem -= pp->p_locked_mem;
3740         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3741         mutex_exit(&global_zone->zone_mem_lock);
3742 
3743         /*
3744          * Create and join a new task in project '0' of this zone.
3745          *
3746          * We don't need to call holdlwps() since we know we're the only lwp in
3747          * this process.
3748          *
3749          * task_join() returns with p_lock held.
3750          */
3751         tk = task_create(0, zone);
3752         mutex_enter(&cpu_lock);
3753         oldtk = task_join(tk, 0);
3754 
3755         pj = pp->p_task->tk_proj;
3756 
3757         mutex_enter(&zone->zone_mem_lock);
3758         zone->zone_locked_mem += pp->p_locked_mem;
3759         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3760         mutex_exit(&zone->zone_mem_lock);
3761 
3762         /*
3763          * Add lwp and process counts to zsched's zone, and increment the
3764          * project's task and process counts for the task created by
3765          * task_create() above.
3766          */
3767         mutex_enter(&zone->zone_nlwps_lock);
3768         pj->kpj_nlwps += pp->p_lwpcnt;
3769         pj->kpj_ntasks += 1;
3770         zone->zone_nlwps += pp->p_lwpcnt;
3771         pj->kpj_nprocs++;
3772         zone->zone_nprocs++;
3773         mutex_exit(&zone->zone_nlwps_lock);
3774 
3775         mutex_exit(&curproc->p_lock);
3776         mutex_exit(&cpu_lock);
3777         task_rele(oldtk);
3778 
3779         /*
3780          * The process was created by a process in the global zone, hence the
3781          * credentials are wrong.  We might as well have kcred-ish credentials.
3782          */
3783         cr = zone->zone_kcred;
3784         crhold(cr);
3785         mutex_enter(&pp->p_crlock);
3786         oldcred = pp->p_cred;
3787         pp->p_cred = cr;
3788         mutex_exit(&pp->p_crlock);
3789         crfree(oldcred);
3790 
3791         /*
3792          * Hold credentials again (for thread)
3793          */
3794         crhold(cr);
3795 
3796         /*
3797          * p_lwpcnt can't change since this is a kernel process.
3798          */
3799         crset(pp, cr);
3800 
3801         /*
3802          * Chroot
3803          */
3804         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3805         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3806 
3807         /*
3808          * Initialize zone's rctl set.
3809          */
3810         set = rctl_set_create();
3811         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3812         mutex_enter(&pp->p_lock);
3813         e.rcep_p.zone = zone;
3814         e.rcep_t = RCENTITY_ZONE;
3815         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3816         mutex_exit(&pp->p_lock);
3817         rctl_prealloc_destroy(gp);
3818 
3819         /*
3820          * Apply the rctls passed in to zone_create().  This is basically a list
3821          * assignment: all of the old values are removed and the new ones
3822          * inserted.  That is, if an empty list is passed in, all values are
3823          * removed.
3824          */
3825         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3826                 rctl_dict_entry_t *rde;
3827                 rctl_hndl_t hndl;
3828                 char *name;
3829                 nvlist_t **nvlarray;
3830                 uint_t i, nelem;
3831                 int error;      /* For ASSERT()s */
3832 
3833                 name = nvpair_name(nvp);
3834                 hndl = rctl_hndl_lookup(name);
3835                 ASSERT(hndl != -1);
3836                 rde = rctl_dict_lookup_hndl(hndl);
3837                 ASSERT(rde != NULL);
3838 
3839                 for (; /* ever */; ) {
3840                         rctl_val_t oval;
3841 
3842                         mutex_enter(&pp->p_lock);
3843                         error = rctl_local_get(hndl, NULL, &oval, pp);
3844                         mutex_exit(&pp->p_lock);
3845                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3846                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3847                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3848                                 break;
3849                         mutex_enter(&pp->p_lock);
3850                         error = rctl_local_delete(hndl, &oval, pp);
3851                         mutex_exit(&pp->p_lock);
3852                         ASSERT(error == 0);
3853                 }
3854                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3855                 ASSERT(error == 0);
3856                 for (i = 0; i < nelem; i++) {
3857                         rctl_val_t *nvalp;
3858 
3859                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3860                         error = nvlist2rctlval(nvlarray[i], nvalp);
3861                         ASSERT(error == 0);
3862                         /*
3863                          * rctl_local_insert can fail if the value being
3864                          * inserted is a duplicate; this is OK.
3865                          */
3866                         mutex_enter(&pp->p_lock);
3867                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
3868                                 kmem_cache_free(rctl_val_cache, nvalp);
3869                         mutex_exit(&pp->p_lock);
3870                 }
3871         }
3872         /*
3873          * Tell the world that we're done setting up.
3874          *
3875          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3876          * and atomically set the zone's processor set visibility.  Once
3877          * we drop pool_lock() this zone will automatically get updated
3878          * to reflect any future changes to the pools configuration.
3879          *
3880          * Note that after we drop the locks below (zonehash_lock in
3881          * particular) other operations such as a zone_getattr call can
3882          * now proceed and observe the zone. That is the reason for doing a
3883          * state transition to the INITIALIZED state.
3884          */
3885         pool_lock();
3886         mutex_enter(&cpu_lock);
3887         mutex_enter(&zonehash_lock);
3888         zone_uniqid(zone);
3889         zone_zsd_configure(zone);
3890         if (pool_state == POOL_ENABLED)
3891                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
3892         mutex_enter(&zone_status_lock);
3893         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3894         zone_status_set(zone, ZONE_IS_INITIALIZED);
3895         mutex_exit(&zone_status_lock);
3896         mutex_exit(&zonehash_lock);
3897         mutex_exit(&cpu_lock);
3898         pool_unlock();
3899 
3900         /* Now run the ZSD create callbacks for all registered keys */
3901         zsd_apply_all_keys(zsd_apply_create, zone);
3902 
3903         /* The callbacks are complete. Mark ZONE_IS_READY */
3904         mutex_enter(&zone_status_lock);
3905         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3906         zone_status_set(zone, ZONE_IS_READY);
3907         mutex_exit(&zone_status_lock);
3908 
3909         /*
3910          * Once we see the zone transition to the ZONE_IS_BOOTING state,
3911          * we launch init, and set the state to running.
3912          */
3913         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3914 
3915         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3916                 id_t cid;
3917 
3918                 /*
3919                  * Ok, this is a little complicated.  We need to grab the
3920                  * zone's pool's scheduling class ID; note that by now, we
3921                  * are already bound to a pool if we need to be (zoneadmd
3922                  * will have done that to us while we're in the READY
3923                  * state).  *But* the scheduling class for the zone's 'init'
3924                  * must be explicitly passed to newproc, which doesn't
3925                  * respect pool bindings.
3926                  *
3927                  * We hold the pool_lock across the call to newproc() to
3928                  * close the obvious race: the pool's scheduling class
3929                  * could change before we manage to create the LWP with
3930                  * classid 'cid'.
3931                  */
3932                 pool_lock();
3933                 if (zone->zone_defaultcid > 0)
3934                         cid = zone->zone_defaultcid;
3935                 else
3936                         cid = pool_get_class(zone->zone_pool);
3937                 if (cid == -1)
3938                         cid = defaultcid;
3939 
3940                 /*
3941                  * If this fails, zone_boot will ultimately fail.  The
3942                  * zone's state will be set to SHUTTING_DOWN; userland will
3943                  * have to tear down the zone and either fail or try again.
3944                  */
3945                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3946                     minclsyspri - 1, &ct, 0)) != 0) {
3947                         mutex_enter(&zone_status_lock);
3948                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3949                         mutex_exit(&zone_status_lock);
3950                 } else {
3951                         zone->zone_boot_time = gethrestime_sec();
3952                 }
3953 
3954                 pool_unlock();
3955         }
3956 
3957         /*
3958          * Wait for zone_destroy() to be called.  This is what we spend
3959          * most of our life doing.
3960          */
3961         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3962 
3963         if (ct)
3964                 /*
3965                  * At this point the process contract should be empty.
3966                  * (Though if it isn't, it's not the end of the world.)
3967                  */
3968                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3969 
3970         /*
3971          * Allow kcred to be freed when all referring processes
3972          * (including this one) go away.  We can't just do this in
3973          * zone_free because we need to wait for the zone_cred_ref to
3974          * drop to 0 before calling zone_free, and the existence of
3975          * zone_kcred will prevent that.  Thus, we call crfree here to
3976          * balance the crdup in zone_create.  The crhold calls earlier
3977          * in zsched will be dropped when the thread and process exit.
3978          */
3979         crfree(zone->zone_kcred);
3980         zone->zone_kcred = NULL;
3981 
3982         exit(CLD_EXITED, 0);
3983 }
3984 
3985 /*
3986  * Helper function to determine if there are any submounts of the
3987  * provided path.  Used to make sure the zone doesn't "inherit" any
3988  * mounts from before it is created.
3989  */
3990 static uint_t
3991 zone_mount_count(const char *rootpath)
3992 {
3993         vfs_t *vfsp;
3994         uint_t count = 0;
3995         size_t rootpathlen = strlen(rootpath);
3996 
3997         /*
3998          * Holding zonehash_lock prevents race conditions with
3999          * vfs_list_add()/vfs_list_remove() since we serialize with
4000          * zone_find_by_path().
4001          */
4002         ASSERT(MUTEX_HELD(&zonehash_lock));
4003         /*
4004          * The rootpath must end with a '/'
4005          */
4006         ASSERT(rootpath[rootpathlen - 1] == '/');
4007 
4008         /*
4009          * This intentionally does not count the rootpath itself if that
4010          * happens to be a mount point.
4011          */
4012         vfs_list_read_lock();
4013         vfsp = rootvfs;
4014         do {
4015                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4016                     rootpathlen) == 0)
4017                         count++;
4018                 vfsp = vfsp->vfs_next;
4019         } while (vfsp != rootvfs);
4020         vfs_list_unlock();
4021         return (count);
4022 }
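
/*
 * For example, with rootpath "/foo/root/", a file system mounted at
 * /foo/root/proc is counted, while one mounted at /foo/root itself is
 * not: the trailing '/' in rootpath prevents the exact-path match.
 */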
4023 
4024 /*
4025  * Helper function to make sure that a zone created on 'rootpath'
4026  * wouldn't end up containing other zones' rootpaths.
4027  */
4028 static boolean_t
4029 zone_is_nested(const char *rootpath)
4030 {
4031         zone_t *zone;
4032         size_t rootpathlen = strlen(rootpath);
4033         size_t len;
4034 
4035         ASSERT(MUTEX_HELD(&zonehash_lock));
4036 
4037         /*
4038          * zone_set_root() appended '/' and '\0' to the end of rootpath, so
4039          * a rootpath of "//" denotes a zone rooted at '/', which would
         * necessarily contain every other zone's rootpath.
         */
4040         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4041             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4042                 return (B_TRUE);
4043 
4044         list_for_each(&zone_active, zone) {
4045                 if (zone == global_zone)
4046                         continue;
4047                 len = strlen(zone->zone_rootpath);
4048                 if (strncmp(rootpath, zone->zone_rootpath,
4049                     MIN(rootpathlen, len)) == 0)
4050                         return (B_TRUE);
4051         }
4052         return (B_FALSE);
4053 }
4054 
4055 static int
4056 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4057     size_t zone_privssz)
4058 {
4059         priv_set_t *privs;
4060 
4061         if (zone_privssz < sizeof (priv_set_t))
4062                 return (ENOMEM);
4063 
4064         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4065 
4066         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4067                 kmem_free(privs, sizeof (priv_set_t));
4068                 return (EFAULT);
4069         }
4070 
4071         zone->zone_privset = privs;
4072         return (0);
4073 }
4074 
4075 /*
4076  * We make creative use of nvlists to pass in rctls from userland.  The list is
4077  * a list of the following structures:
4078  *
4079  * (name = rctl_name, value = nvpair_list_array)
4080  *
4081  * Where each element of the nvpair_list_array is of the form:
4082  *
4083  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4084  *      (name = "limit", value = uint64_t),
4085  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4086  */
4087 static int
4088 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4089 {
4090         nvpair_t *nvp = NULL;
4091         nvlist_t *nvl = NULL;
4092         char *kbuf;
4093         int error;
4094         rctl_val_t rv;
4095 
4096         *nvlp = NULL;
4097 
4098         if (buflen == 0)
4099                 return (0);
4100 
4101         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4102                 return (ENOMEM);
4103         if (copyin(ubuf, kbuf, buflen)) {
4104                 error = EFAULT;
4105                 goto out;
4106         }
4107         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4108                 /*
4109                  * nvlist_unpack() may have allocated and freed nvl while
4110                  * leaving the pointer set to non-NULL, so reset it here.
4111                  */
4112                 nvl = NULL;
4113                 error = EINVAL;
4114                 goto out;
4115         }
4116         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4117                 rctl_dict_entry_t *rde;
4118                 rctl_hndl_t hndl;
4119                 nvlist_t **nvlarray;
4120                 uint_t i, nelem;
4121                 char *name;
4122 
4123                 error = EINVAL;
4124                 name = nvpair_name(nvp);
4125                 if (strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 ||
4126                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4127                         goto out;
4128                 }
4129                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4130                         goto out;
4131                 }
4132                 rde = rctl_dict_lookup_hndl(hndl);
4133                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4134                 ASSERT(error == 0);
4135                 for (i = 0; i < nelem; i++) {
4136                         if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4137                                 goto out;
4138                 }
4139                 if (rctl_invalid_value(rde, &rv)) {
4140                         error = EINVAL;
4141                         goto out;
4142                 }
4143         }
4144         error = 0;
4145         *nvlp = nvl;
4146 out:
4147         kmem_free(kbuf, buflen);
4148         if (error && nvl != NULL)
4149                 nvlist_free(nvl);
4150         return (error);
4151 }
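
/*
 * A sketch of how a (hypothetical) userland caller would build the
 * buffer parsed above, using the nvlist shape documented before
 * parse_rctls():
 *
 *	nvlist_t *nvl, *val;
 *	nvlist_t *vals[1];
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 100);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_DENY);
 *	vals[0] = val;
 *	(void) nvlist_add_nvlist_array(nvl, "zone.max-lwps", vals, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * buf and buflen are then passed as the rctlbuf/rctlbufsz arguments of
 * zone_create().
 */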
4152 
4153 int
4154 zone_create_error(int er_error, int er_ext, int *er_out)
{
4155         if (er_out != NULL) {
4156                 if (copyout(&er_ext, er_out, sizeof (int))) {
4157                         return (set_errno(EFAULT));
4158                 }
4159         }
4160         return (set_errno(er_error));
4161 }
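
/*
 * The extended error disambiguates the errno for zone_create() callers;
 * in the code below, ZE_CHROOTED accompanies ENOTSUP (creation
 * attempted from a chroot), ZE_AREMOUNTS accompanies EBUSY (file
 * systems are mounted under the rootpath), and ZE_LABELINUSE
 * accompanies EEXIST or EBUSY (another zone already has the requested
 * label).
 */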
4162 
4163 static int
4164 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4165 {
4166         ts_label_t *tsl;
4167         bslabel_t blab;
4168 
4169         /* Get label from user */
4170         if (copyin(lab, &blab, sizeof (blab)) != 0)
4171                 return (EFAULT);
4172         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4173         if (tsl == NULL)
4174                 return (ENOMEM);
4175 
4176         zone->zone_slabel = tsl;
4177         return (0);
4178 }
4179 
4180 /*
4181  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4182  */
4183 static int
4184 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4185 {
4186         char *kbuf;
4187         char *dataset, *next;
4188         zone_dataset_t *zd;
4189         size_t len;
4190 
4191         if (ubuf == NULL || buflen == 0)
4192                 return (0);
4193 
4194         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4195                 return (ENOMEM);
4196 
4197         if (copyin(ubuf, kbuf, buflen) != 0) {
4198                 kmem_free(kbuf, buflen);
4199                 return (EFAULT);
4200         }
4201 
4202         dataset = next = kbuf;
4203         for (;;) {
4204                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4205 
4206                 next = strchr(dataset, ',');
4207 
4208                 if (next == NULL)
4209                         len = strlen(dataset);
4210                 else
4211                         len = next - dataset;
4212 
4213                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4214                 bcopy(dataset, zd->zd_dataset, len);
4215                 zd->zd_dataset[len] = '\0';
4216 
4217                 list_insert_head(&zone->zone_datasets, zd);
4218 
4219                 if (next == NULL)
4220                         break;
4221 
4222                 dataset = next + 1;
4223         }
4224 
4225         kmem_free(kbuf, buflen);
4226         return (0);
4227 }
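
/*
 * For example, a buffer containing "tank/zones/web,tank/shared" yields
 * two zone_dataset_t entries, "tank/zones/web" and "tank/shared", on
 * the zone's zone_datasets list.
 */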
4228 
4229 /*
4230  * System call to create/initialize a new zone named 'zone_name', rooted
4231  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4232  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4233  * with labeling set by 'match', 'doi', and 'label'.
4234  *
4235  * If extended error is non-null, we may use it to return more detailed
4236  * error information.
4237  */
4238 static zoneid_t
4239 zone_create(const char *zone_name, const char *zone_root,
4240     const priv_set_t *zone_privs, size_t zone_privssz,
4241     caddr_t rctlbuf, size_t rctlbufsz,
4242     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4243     int match, uint32_t doi, const bslabel_t *label,
4244     int flags)
4245 {
4246         struct zsched_arg zarg;
4247         nvlist_t *rctls = NULL;
4248         proc_t *pp = curproc;
4249         zone_t *zone, *ztmp;
4250         zoneid_t zoneid;
4251         int error;
4252         int error2 = 0;
4253         char *str;
4254         cred_t *zkcr;
4255         boolean_t insert_label_hash;
4256 
4257         if (secpolicy_zone_config(CRED()) != 0)
4258                 return (set_errno(EPERM));
4259 
4260         /* can't create a zone from within a chroot environment */
4261         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4262                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4263                     extended_error));
4264 
4265         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4266         zoneid = zone->zone_id = id_alloc(zoneid_space);
4267         zone->zone_status = ZONE_IS_UNINITIALIZED;
4268         zone->zone_pool = pool_default;
4269         zone->zone_pool_mod = gethrtime();
4270         zone->zone_psetid = ZONE_PS_INVAL;
4271         zone->zone_ncpus = 0;
4272         zone->zone_ncpus_online = 0;
4273         zone->zone_restart_init = B_TRUE;
4274         zone->zone_brand = &native_brand;
4275         zone->zone_initname = NULL;
4276         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4277         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4278         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4279         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4280         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4281             offsetof(zone_ref_t, zref_linkage));
4282         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4283             offsetof(struct zsd_entry, zsd_linkage));
4284         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4285             offsetof(zone_dataset_t, zd_linkage));
4286         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4287             offsetof(zone_dl_t, zdl_linkage));
4288         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4289         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4290 
4291         if (flags & ZCF_NET_EXCL) {
4292                 zone->zone_flags |= ZF_NET_EXCL;
4293         }
4294 
4295         if ((error = zone_set_name(zone, zone_name)) != 0) {
4296                 zone_free(zone);
4297                 return (zone_create_error(error, 0, extended_error));
4298         }
4299 
4300         if ((error = zone_set_root(zone, zone_root)) != 0) {
4301                 zone_free(zone);
4302                 return (zone_create_error(error, 0, extended_error));
4303         }
4304         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4305                 zone_free(zone);
4306                 return (zone_create_error(error, 0, extended_error));
4307         }
4308 
4309         /* initialize node name to be the same as zone name */
4310         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4311         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4312         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4313 
4314         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4315         zone->zone_domain[0] = '\0';
4316         zone->zone_hostid = HW_INVALID_HOSTID;
4317         zone->zone_shares = 1;
4318         zone->zone_shmmax = 0;
4319         zone->zone_ipc.ipcq_shmmni = 0;
4320         zone->zone_ipc.ipcq_semmni = 0;
4321         zone->zone_ipc.ipcq_msgmni = 0;
4322         zone->zone_bootargs = NULL;
4323         zone->zone_fs_allowed = NULL;
4324         zone->zone_initname =
4325             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4326         (void) strcpy(zone->zone_initname, zone_default_initname);
4327         zone->zone_nlwps = 0;
4328         zone->zone_nlwps_ctl = INT_MAX;
4329         zone->zone_nprocs = 0;
4330         zone->zone_nprocs_ctl = INT_MAX;
4331         zone->zone_locked_mem = 0;
4332         zone->zone_locked_mem_ctl = UINT64_MAX;
4333         zone->zone_max_swap = 0;
4334         zone->zone_max_swap_ctl = UINT64_MAX;
4335         zone->zone_max_lofi = 0;
4336         zone->zone_max_lofi_ctl = UINT64_MAX;
4337         zone->zone_lockedmem_kstat = NULL;
4338         zone->zone_swapresv_kstat = NULL;
4339 
4340         /*
4341          * Zsched initializes the rctls.
4342          */
4343         zone->zone_rctls = NULL;
4344 
4345         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4346                 zone_free(zone);
4347                 return (zone_create_error(error, 0, extended_error));
4348         }
4349 
4350         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4351                 zone_free(zone);
4352                 return (set_errno(error));
4353         }
4354 
4355         /*
4356          * Read in the trusted system parameters:
4357          * match flag and sensitivity label.
4358          */
4359         zone->zone_match = match;
4360         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4361                 /* Fail if requested to set doi to anything but system's doi */
4362                 if (doi != 0 && doi != default_doi) {
4363                         zone_free(zone);
4364                         return (set_errno(EINVAL));
4365                 }
4366                 /* Always apply system's doi to the zone */
4367                 error = zone_set_label(zone, label, default_doi);
4368                 if (error != 0) {
4369                         zone_free(zone);
4370                         return (set_errno(error));
4371                 }
4372                 insert_label_hash = B_TRUE;
4373         } else {
4374                 /* all zones get an admin_low label if system is not labeled */
4375                 zone->zone_slabel = l_admin_low;
4376                 label_hold(l_admin_low);
4377                 insert_label_hash = B_FALSE;
4378         }
4379 
4380         /*
4381          * Stop all lwps since that's what normally happens as part of fork().
4382          * This needs to happen before we grab any locks to avoid deadlock
4383          * (another lwp in the process could be waiting for the held lock).
4384          */
4385         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4386                 zone_free(zone);
4387                 if (rctls)
4388                         nvlist_free(rctls);
4389                 return (zone_create_error(error, 0, extended_error));
4390         }
4391 
4392         if (block_mounts() == 0) {
4393                 mutex_enter(&pp->p_lock);
4394                 if (curthread != pp->p_agenttp)
4395                         continuelwps(pp);
4396                 mutex_exit(&pp->p_lock);
4397                 zone_free(zone);
4398                 if (rctls)
4399                         nvlist_free(rctls);
4400                 return (zone_create_error(error, 0, extended_error));
4401         }
4402 
4403         /*
4404          * Set up credential for kernel access.  After this, any errors
4405          * should go through the dance in errout rather than calling
4406          * zone_free directly.
4407          */
4408         zone->zone_kcred = crdup(kcred);
4409         crsetzone(zone->zone_kcred, zone);
4410         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4411         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4412         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4413         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4414 
4415         mutex_enter(&zonehash_lock);
4416         /*
4417          * Make sure zone doesn't already exist.
4418          *
4419          * If the system and zone are labeled,
4420          * make sure no other zone exists that has the same label.
4421          */
4422         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4423             (insert_label_hash &&
4424             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4425                 zone_status_t status;
4426 
4427                 status = zone_status_get(ztmp);
4428                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4429                         error = EEXIST;
4430                 else
4431                         error = EBUSY;
4432 
4433                 if (insert_label_hash)
4434                         error2 = ZE_LABELINUSE;
4435 
4436                 goto errout;
4437         }
4438 
4439         /*
4440          * Don't allow zone creations which would cause one zone's rootpath to
4441          * be accessible from that of another (non-global) zone.
4442          */
4443         if (zone_is_nested(zone->zone_rootpath)) {
4444                 error = EBUSY;
4445                 goto errout;
4446         }
4447 
4448         ASSERT(zonecount != 0);         /* check for leaks */
4449         if (zonecount + 1 > maxzones) {
4450                 error = ENOMEM;
4451                 goto errout;
4452         }
4453 
4454         if (zone_mount_count(zone->zone_rootpath) != 0) {
4455                 error = EBUSY;
4456                 error2 = ZE_AREMOUNTS;
4457                 goto errout;
4458         }
4459 
4460         /*
4461          * Zone is still incomplete, but we need to drop all locks while
4462          * zsched() initializes this zone's kernel process.  We
4463          * optimistically add the zone to the hashtable and associated
4464          * lists so a parallel zone_create() doesn't try to create the
4465          * same zone.
4466          */
4467         zonecount++;
4468         (void) mod_hash_insert(zonehashbyid,
4469             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4470             (mod_hash_val_t)(uintptr_t)zone);
4471         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4472         (void) strcpy(str, zone->zone_name);
4473         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4474             (mod_hash_val_t)(uintptr_t)zone);
4475         if (insert_label_hash) {
4476                 (void) mod_hash_insert(zonehashbylabel,
4477                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4478                 zone->zone_flags |= ZF_HASHED_LABEL;
4479         }
4480 
4481         /*
4482          * Insert into active list.  At this point there are no 'hold's
4483          * on the zone, but everyone else knows not to use it, so we can
4484          * continue to use it.  zsched() will do a zone_hold() if the
4485          * newproc() is successful.
4486          */
4487         list_insert_tail(&zone_active, zone);
4488         mutex_exit(&zonehash_lock);
4489 
4490         zarg.zone = zone;
4491         zarg.nvlist = rctls;
4492         /*
4493          * The process, task, and project rctls are probably wrong;
4494          * we need an interface to get the default values of all rctls,
4495          * and initialize zsched appropriately.  I'm not sure that that
4496          * makes much of a difference, though.
4497          */
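             /*
              * Hand off to zsched(): newproc() starts it as a system-class
              * (syscid) kernel process at minimum system priority, passing
              * it the zone and rctl list through zarg.
              */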
4498         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4499         if (error != 0) {
4500                 /*
4501                  * We need to undo all globally visible state.
4502                  */
4503                 mutex_enter(&zonehash_lock);
4504                 list_remove(&zone_active, zone);
4505                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4506                         ASSERT(zone->zone_slabel != NULL);
4507                         (void) mod_hash_destroy(zonehashbylabel,
4508                             (mod_hash_key_t)zone->zone_slabel);
4509                 }
4510                 (void) mod_hash_destroy(zonehashbyname,
4511                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4512                 (void) mod_hash_destroy(zonehashbyid,
4513                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4514                 ASSERT(zonecount > 1);
4515                 zonecount--;
4516                 goto errout;
4517         }
4518 
4519         /*
4520          * Zone creation can't fail from now on.
4521          */
4522 
4523         /*
4524          * Create zone kstats
4525          */
4526         zone_kstat_create(zone);
4527 
4528         /*
4529          * Let the other lwps continue.
4530          */
4531         mutex_enter(&pp->p_lock);
4532         if (curthread != pp->p_agenttp)
4533                 continuelwps(pp);
4534         mutex_exit(&pp->p_lock);
4535 
4536         /*
4537          * Wait for zsched to finish initializing the zone.
4538          */
4539         zone_status_wait(zone, ZONE_IS_READY);
4540         /*
4541          * The zone is fully visible, so we can let mounts progress.
4542          */
4543         resume_mounts();
4544         if (rctls)
4545                 nvlist_free(rctls);
4546 
4547         return (zoneid);
4548 
4549 errout:
4550         mutex_exit(&zonehash_lock);
4551         /*
4552          * Let the other lwps continue.
4553          */
4554         mutex_enter(&pp->p_lock);
4555         if (curthread != pp->p_agenttp)
4556                 continuelwps(pp);
4557         mutex_exit(&pp->p_lock);
4558 
4559         resume_mounts();
4560         if (rctls)
4561                 nvlist_free(rctls);
4562         /*
4563          * There is currently one reference to the zone, a cred_ref from
4564          * zone_kcred.  To free the zone, we call crfree, which will call
4565          * zone_cred_rele, which will call zone_free.
4566          */
4567         ASSERT(zone->zone_cred_ref == 1);
4568         ASSERT(zone->zone_kcred->cr_ref == 1);
4569         ASSERT(zone->zone_ref == 0);
4570         zkcr = zone->zone_kcred;
4571         zone->zone_kcred = NULL;
4572         crfree(zkcr);                           /* triggers call to zone_free */
4573         return (zone_create_error(error, error2, extended_error));
4574 }
4575 
4576 /*
4577  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4578  * the heavy lifting.  The program launched at the "top" of the zone is
4579  * whatever zone_initname names (settable via ZONE_ATTR_INITNAME); if
4580  * none was set, the system default in zone_default_initname is used.
4581  */
4582 static int
4583 zone_boot(zoneid_t zoneid)
4584 {
4585         int err;
4586         zone_t *zone;
4587 
4588         if (secpolicy_zone_config(CRED()) != 0)
4589                 return (set_errno(EPERM));
4590         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4591                 return (set_errno(EINVAL));
4592 
4593         mutex_enter(&zonehash_lock);
4594         /*
4595          * Look for zone under hash lock to prevent races with calls to
4596          * zone_shutdown, zone_destroy, etc.
4597          */
4598         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4599                 mutex_exit(&zonehash_lock);
4600                 return (set_errno(EINVAL));
4601         }
4602 
4603         mutex_enter(&zone_status_lock);
4604         if (zone_status_get(zone) != ZONE_IS_READY) {
4605                 mutex_exit(&zone_status_lock);
4606                 mutex_exit(&zonehash_lock);
4607                 return (set_errno(EINVAL));
4608         }
4609         zone_status_set(zone, ZONE_IS_BOOTING);
4610         mutex_exit(&zone_status_lock);
4611 
4612         zone_hold(zone);        /* so we can use the zone_t later */
4613         mutex_exit(&zonehash_lock);
4614 
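             /*
              * Wait for zsched to get init running; zone_status_wait_sig()
              * returns zero if the wait was interrupted by a signal.
              */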
4615         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4616                 zone_rele(zone);
4617                 return (set_errno(EINTR));
4618         }
4619 
4620         /*
4621          * Boot (starting init) might have failed, in which case the zone
4622          * will go to the SHUTTING_DOWN state; an appropriate errno will
4623          * be placed in zone->zone_boot_err, and so we return that.
4624          */
4625         err = zone->zone_boot_err;
4626         zone_rele(zone);
4627         return (err ? set_errno(err) : 0);
4628 }
4629 
4630 /*
4631  * Kills all user processes in the zone, waiting for them all to exit
4632  * before returning.
4633  */
4634 static int
4635 zone_empty(zone_t *zone)
4636 {
4637         int waitstatus;
4638 
4639         /*
4640          * We need to drop zonehash_lock before killing all
4641          * processes, otherwise we'll deadlock with zone_find_*
4642          * which can be called from the exit path.
4643          */
4644         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
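             /*
              * Wait up to one second (hz ticks) at a time for the zone to
              * become empty; whenever zone_status_timedwait_sig() returns -1
              * (timed out), sweep the zone with killall() and wait again.
              */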
4645         while ((waitstatus = zone_status_timedwait_sig(zone,
4646             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4647                 killall(zone->zone_id);
4648         }
4649         /*
4650          * return EINTR if we were signaled
4651          */
4652         if (waitstatus == 0)
4653                 return (EINTR);
4654         return (0);
4655 }
4656 
4657 /*
4658  * This function implements the policy for zone visibility.
4659  *
4660  * In standard Solaris, a non-global zone can only see itself.
4661  *
4662  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4663  * it dominates. For this test, the label of the global zone is treated as
4664  * admin_high so it is special-cased instead of being checked for dominance.
4665  *
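      * For example, if zone A's label dominates zone B's label, a process
      * in A can look up B's attributes, while a process in B cannot look
      * up A.
      *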
4666  * Returns true if zone attributes are viewable, false otherwise.
4667  */
4668 static boolean_t
4669 zone_list_access(zone_t *zone)
4670 {
4671 
4672         if (curproc->p_zone == global_zone ||
4673             curproc->p_zone == zone) {
4674                 return (B_TRUE);
4675         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4676                 bslabel_t *curproc_label;
4677                 bslabel_t *zone_label;
4678 
4679                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4680                 zone_label = label2bslabel(zone->zone_slabel);
4681 
4682                 if (zone->zone_id != GLOBAL_ZONEID &&
4683                     bldominates(curproc_label, zone_label)) {
4684                         return (B_TRUE);
4685                 } else {
4686                         return (B_FALSE);
4687                 }
4688         } else {
4689                 return (B_FALSE);
4690         }
4691 }
4692 
4693 /*
4694  * Systemcall to start the zone's halt sequence.  By the time this
4695  * function successfully returns, all user processes and kernel threads
4696  * executing in it will have exited, ZSD shutdown callbacks executed,
4697  * and the zone status set to ZONE_IS_DOWN.
4698  *
4699  * It is possible that the call will interrupt itself if the caller is the
4700  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4701  */
4702 static int
4703 zone_shutdown(zoneid_t zoneid)
4704 {
4705         int error;
4706         zone_t *zone;
4707         zone_status_t status;
4708 
4709         if (secpolicy_zone_config(CRED()) != 0)
4710                 return (set_errno(EPERM));
4711         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4712                 return (set_errno(EINVAL));
4713 
4714         /*
4715          * Block mounts so that VFS_MOUNT() can get an accurate view of
4716          * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4717          *
4718          * E.g. NFS can fail the mount if it determines that the zone
4719          * has already begun the shutdown sequence.
4720          */
4721         if (block_mounts() == 0)
4722                 return (set_errno(EINTR));
4723         mutex_enter(&zonehash_lock);
4724         /*
4725          * Look for zone under hash lock to prevent races with other
4726          * calls to zone_shutdown and zone_destroy.
4727          */
4728         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4729                 mutex_exit(&zonehash_lock);
4730                 resume_mounts();
4731                 return (set_errno(EINVAL));
4732         }
4733         mutex_enter(&zone_status_lock);
4734         status = zone_status_get(zone);
4735         /*
4736          * Fail if the zone isn't fully initialized yet.
4737          */
4738         if (status < ZONE_IS_READY) {
4739                 mutex_exit(&zone_status_lock);
4740                 mutex_exit(&zonehash_lock);
4741                 resume_mounts();
4742                 return (set_errno(EINVAL));
4743         }
4744         /*
4745          * If conditions required for zone_shutdown() to return have been met,
4746          * return success.
4747          */
4748         if (status >= ZONE_IS_DOWN) {
4749                 mutex_exit(&zone_status_lock);
4750                 mutex_exit(&zonehash_lock);
4751                 resume_mounts();
4752                 return (0);
4753         }
4754         /*
4755          * If zone_shutdown() hasn't been called before, go through the motions.
4756          * If it has, there's nothing to do but wait for the kernel threads to
4757          * drain.
4758          */
4759         if (status < ZONE_IS_EMPTY) {
4760                 uint_t ntasks;
4761 
4762                 mutex_enter(&zone->zone_lock);
4763                 if ((ntasks = zone->zone_ntasks) != 1) {
4764                         /*
4765                          * There's still stuff running.
4766                          */
4767                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4768                 }
4769                 mutex_exit(&zone->zone_lock);
4770                 if (ntasks == 1) {
4771                         /*
4772                          * The only way to create another task is through
4773                          * zone_enter(), which will block until we drop
4774                          * zonehash_lock.  The zone is empty.
4775                          */
4776                         if (zone->zone_kthreads == NULL) {
4777                                 /*
4778                                  * Skip ahead to ZONE_IS_DOWN
4779                                  */
4780                                 zone_status_set(zone, ZONE_IS_DOWN);
4781                         } else {
4782                                 zone_status_set(zone, ZONE_IS_EMPTY);
4783                         }
4784                 }
4785         }
4786         zone_hold(zone);        /* so we can use the zone_t later */
4787         mutex_exit(&zone_status_lock);
4788         mutex_exit(&zonehash_lock);
4789         resume_mounts();
4790 
4791         if ((error = zone_empty(zone)) != 0) {
4792                 zone_rele(zone);
4793                 return (set_errno(error));
4794         }
4795         /*
4796          * After the zone status goes to ZONE_IS_DOWN this zone will no
4797          * longer be notified of changes to the pools configuration, so
4798          * in order to not end up with a stale pool pointer, we point
4799          * ourselves at the default pool and remove all resource
4800          * visibility.  This is especially important as the zone_t may
4801          * languish on the deathrow for a very long time waiting for
4802          * cred's to drain out.
4803          *
4804          * This rebinding of the zone can happen multiple times
4805          * (presumably due to interrupted or parallel systemcalls)
4806          * without any adverse effects.
4807          */
4808         if (pool_lock_intr() != 0) {
4809                 zone_rele(zone);
4810                 return (set_errno(EINTR));
4811         }
4812         if (pool_state == POOL_ENABLED) {
4813                 mutex_enter(&cpu_lock);
4814                 zone_pool_set(zone, pool_default);
4815                 /*
4816                  * The zone no longer needs to be able to see any cpus.
4817                  */
4818                 zone_pset_set(zone, ZONE_PS_INVAL);
4819                 mutex_exit(&cpu_lock);
4820         }
4821         pool_unlock();
4822 
4823         /*
4824          * ZSD shutdown callbacks can be executed multiple times, hence
4825          * it is safe to not be holding any locks across this call.
4826          */
4827         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4828 
4829         mutex_enter(&zone_status_lock);
4830         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4831                 zone_status_set(zone, ZONE_IS_DOWN);
4832         mutex_exit(&zone_status_lock);
4833 
4834         /*
4835          * Wait for kernel threads to drain.
4836          */
4837         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4838                 zone_rele(zone);
4839                 return (set_errno(EINTR));
4840         }
4841 
4842         /*
4843          * The zone can become down/destroyable even if the above wait
4844          * returns EINTR, so any code added here may never execute.
4845          * (i.e. don't add code here)
4846          */
4847 
4848         zone_rele(zone);
4849         return (0);
4850 }
4851 
4852 /*
4853  * Log the specified zone's reference counts.  The caller should not be
4854  * holding the zone's zone_lock.
4855  */
4856 static void
4857 zone_log_refcounts(zone_t *zone)
4858 {
4859         char *buffer;
4860         char *buffer_position;
4861         uint32_t buffer_size;
4862         uint32_t index;
4863         uint_t ref;
4864         uint_t cred_ref;
4865 
4866         /*
4867          * Construct a string representing the subsystem-specific reference
4868          * counts.  The counts are printed in ascending order by index into the
4869          * zone_t::zone_subsys_ref array.  The list will be surrounded by
4870          * square brackets [] and will only contain nonzero reference counts.
4871          *
4872          * The buffer will hold two square bracket characters plus ten digits,
4873          * one colon, one space, one comma, and some characters for a
4874          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4875          * bit integers have at most ten decimal digits.)  The last
4876          * reference count's comma is replaced by the closing square
4877          * bracket and a NUL character to terminate the string.
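              *
              * For example, an entry for a hypothetical subsystem named
              * "foo" would be rendered as "foo: <count>," and budgeted at
              * strlen("foo") + 13 characters: one colon, one space, one
              * comma, and up to ten digits for the count.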
4878          *
4879          * NOTE: We have to grab the zone's zone_lock to create a consistent
4880          * snapshot of the zone's reference counters.
4881          *
4882          * First, figure out how much space the string buffer will need.
4883          * The buffer's size is stored in buffer_size.
4884          */
4885         buffer_size = 2;                        /* for the square brackets */
4886         mutex_enter(&zone->zone_lock);
4887         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4888         ref = zone->zone_ref;
4889         cred_ref = zone->zone_cred_ref;
4890         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4891                 if (zone->zone_subsys_ref[index] != 0)
4892                         buffer_size += strlen(zone_ref_subsys_names[index]) +
4893                             13;
4894         if (buffer_size == 2) {
4895                 /*
4896                  * No subsystems had nonzero reference counts.  Don't bother
4897                  * with allocating a buffer; just log the general-purpose and
4898                  * credential reference counts.
4899                  */
4900                 mutex_exit(&zone->zone_lock);
4901                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4902                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
4903                     "references and %u credential references are still extant",
4904                     zone->zone_name, zone->zone_id, ref, cred_ref);
4905                 return;
4906         }
4907 
4908         /*
4909          * buffer_size contains the exact number of characters that the
4910          * buffer will need.  Allocate the buffer and fill it with nonzero
4911          * subsystem-specific reference counts.  Surround the results with
4912          * square brackets afterwards.
4913          */
4914         buffer = kmem_alloc(buffer_size, KM_SLEEP);
4915         buffer_position = &buffer[1];
4916         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4917                 /*
4918                  * NOTE: The DDI's version of sprintf() returns a pointer to
4919                  * the modified buffer rather than the number of bytes written
4920                  * (as in snprintf(3C)).  This is unfortunate and annoying.
4921                  * Therefore, we'll use snprintf() with INT_MAX to get the
4922                  * number of bytes written.  Using INT_MAX is safe because
4923                  * the buffer is perfectly sized for the data: we'll never
4924                  * overrun the buffer.
4925                  */
4926                 if (zone->zone_subsys_ref[index] != 0)
4927                         buffer_position += snprintf(buffer_position, INT_MAX,
4928                             "%s: %u,", zone_ref_subsys_names[index],
4929                             zone->zone_subsys_ref[index]);
4930         }
4931         mutex_exit(&zone->zone_lock);
4932         buffer[0] = '[';
4933         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4934         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4935         buffer_position[-1] = ']';
4936 
4937         /*
4938          * Log the reference counts and free the message buffer.
4939          */
4940         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4941             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4942             "%u credential references are still extant %s", zone->zone_name,
4943             zone->zone_id, ref, cred_ref, buffer);
4944         kmem_free(buffer, buffer_size);
4945 }
4946 
4947 /*
4948  * Systemcall entry point to finalize the zone halt process.  The caller
4949  * must have already successfully called zone_shutdown().
4950  *
4951  * Upon successful completion, the zone will have been fully destroyed:
4952  * zsched will have exited, destructor callbacks executed, and the zone
4953  * removed from the list of active zones.
4954  */
4955 static int
4956 zone_destroy(zoneid_t zoneid)
4957 {
4958         uint64_t uniqid;
4959         zone_t *zone;
4960         zone_status_t status;
4961         clock_t wait_time;
4962         boolean_t log_refcounts;
4963 
4964         if (secpolicy_zone_config(CRED()) != 0)
4965                 return (set_errno(EPERM));
4966         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4967                 return (set_errno(EINVAL));
4968 
4969         mutex_enter(&zonehash_lock);
4970         /*
4971          * Look for zone under hash lock to prevent races with other
4972          * calls to zone_destroy.
4973          */
4974         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4975                 mutex_exit(&zonehash_lock);
4976                 return (set_errno(EINVAL));
4977         }
4978 
4979         if (zone_mount_count(zone->zone_rootpath) != 0) {
4980                 mutex_exit(&zonehash_lock);
4981                 return (set_errno(EBUSY));
4982         }
4983         mutex_enter(&zone_status_lock);
4984         status = zone_status_get(zone);
4985         if (status < ZONE_IS_DOWN) {
4986                 mutex_exit(&zone_status_lock);
4987                 mutex_exit(&zonehash_lock);
4988                 return (set_errno(EBUSY));
4989         } else if (status == ZONE_IS_DOWN) {
4990                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
4991         }
4992         mutex_exit(&zone_status_lock);
4993         zone_hold(zone);
4994         mutex_exit(&zonehash_lock);
4995 
4996         /*
4997          * wait for zsched to exit
4998          */
4999         zone_status_wait(zone, ZONE_IS_DEAD);
5000         zone_zsd_callbacks(zone, ZSD_DESTROY);
5001         zone->zone_netstack = NULL;
5002         uniqid = zone->zone_uniqid;
5003         zone_rele(zone);
5004         zone = NULL;    /* potentially free'd */
5005 
5006         log_refcounts = B_FALSE;
5007         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5008         mutex_enter(&zonehash_lock);
5009         for (; /* ever */; ) {
5010                 boolean_t unref;
5011                 boolean_t refs_have_been_logged;
5012 
5013                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5014                     zone->zone_uniqid != uniqid) {
5015                         /*
5016                          * The zone has gone away.  Necessary conditions
5017                          * are met, so we return success.
5018                          */
5019                         mutex_exit(&zonehash_lock);
5020                         return (0);
5021                 }
5022                 mutex_enter(&zone->zone_lock);
5023                 unref = ZONE_IS_UNREF(zone);
5024                 refs_have_been_logged = (zone->zone_flags &
5025                     ZF_REFCOUNTS_LOGGED);
5026                 mutex_exit(&zone->zone_lock);
5027                 if (unref) {
5028                         /*
5029                          * There is only one reference to the zone -- that
5030                          * added when the zone was added to the hashtables --
5031                          * and things will remain this way until we drop
5032                          * zonehash_lock... we can go ahead and cleanup the
5033                          * zone.
5034                          */
5035                         break;
5036                 }
5037 
5038                 /*
5039                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5040                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5041                  * some zone's general-purpose reference count reaches one.
5042                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5043                  * on zone_destroy_cv, then log the zone's reference counts and
5044                  * continue to wait for zone_rele() and zone_cred_rele().
5045                  */
5046                 if (!refs_have_been_logged) {
5047                         if (!log_refcounts) {
5048                                 /*
5049                                  * This thread hasn't timed out waiting on
5050                                  * zone_destroy_cv yet.  Wait wait_time clock
5051                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5052                                  * seconds) for the zone's references to clear.
5053                                  */
5054                                 ASSERT(wait_time > 0);
5055                                 wait_time = cv_reltimedwait_sig(
5056                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5057                                     TR_SEC);
5058                                 if (wait_time > 0) {
5059                                         /*
5060                                          * A thread in zone_rele() or
5061                                          * zone_cred_rele() signaled
5062                                          * zone_destroy_cv before this thread's
5063                                          * wait timed out.  The zone might have
5064                                          * only one reference left; find out!
5065                                          */
5066                                         continue;
5067                                 } else if (wait_time == 0) {
5068                                         /* The thread's process was signaled. */
5069                                         mutex_exit(&zonehash_lock);
5070                                         return (set_errno(EINTR));
5071                                 }
5072 
5073                                 /*
5074                                  * The thread timed out while waiting on
5075                                  * zone_destroy_cv.  Even though the thread
5076                                  * timed out, it has to check whether another
5077                                  * thread woke up from zone_destroy_cv and
5078                                  * destroyed the zone.
5079                                  *
5080                                  * If the zone still exists and has more than
5081                                  * one unreleased general-purpose reference,
5082                                  * then log the zone's reference counts.
5083                                  */
5084                                 log_refcounts = B_TRUE;
5085                                 continue;
5086                         }
5087 
5088                         /*
5089                          * The thread already timed out on zone_destroy_cv while
5090                          * waiting for subsystems to release the zone's last
5091                          * general-purpose references.  Log the zone's reference
5092                          * counts and wait indefinitely on zone_destroy_cv.
5093                          */
5094                         zone_log_refcounts(zone);
5095                 }
5096                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5097                         /* The thread's process was signaled. */
5098                         mutex_exit(&zonehash_lock);
5099                         return (set_errno(EINTR));
5100                 }
5101         }
5102 
5103         /*
5104          * Remove CPU cap for this zone now since we're not going to
5105          * fail below this point.
5106          */
5107         cpucaps_zone_remove(zone);
5108 
5109         /* Get rid of the zone's kstats */
5110         zone_kstat_delete(zone);
5111 
5112         /* remove the pfexecd doors */
5113         if (zone->zone_pfexecd != NULL) {
5114                 klpd_freelist(&zone->zone_pfexecd);
5115                 zone->zone_pfexecd = NULL;
5116         }
5117 
5118         /* free brand specific data */
5119         if (ZONE_IS_BRANDED(zone))
5120                 ZBROP(zone)->b_free_brand_data(zone);
5121 
5122         /* Say goodbye to brand framework. */
5123         brand_unregister_zone(zone->zone_brand);
5124 
5125         /*
5126          * It is now safe to let the zone be recreated; remove it from the
5127          * lists.  The memory will not be freed until the last cred
5128          * reference goes away.
5129          */
5130         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5131         zonecount--;
5132         /* remove from active list and hash tables */
5133         list_remove(&zone_active, zone);
5134         (void) mod_hash_destroy(zonehashbyname,
5135             (mod_hash_key_t)zone->zone_name);
5136         (void) mod_hash_destroy(zonehashbyid,
5137             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5138         if (zone->zone_flags & ZF_HASHED_LABEL)
5139                 (void) mod_hash_destroy(zonehashbylabel,
5140                     (mod_hash_key_t)zone->zone_slabel);
5141         mutex_exit(&zonehash_lock);
5142 
5143         /*
5144          * Release the root vnode; we're not using it anymore, and no other
5145          * thread that might access it should still exist.
5146          */
5147         if (zone->zone_rootvp != NULL) {
5148                 VN_RELE(zone->zone_rootvp);
5149                 zone->zone_rootvp = NULL;
5150         }
5151 
5152         /* add to deathrow list */
5153         mutex_enter(&zone_deathrow_lock);
5154         list_insert_tail(&zone_deathrow, zone);
5155         mutex_exit(&zone_deathrow_lock);
5156 
5157         /*
5158          * Drop last reference (which was added by zsched()), this will
5159          * free the zone unless there are outstanding cred references.
5160          */
5161         zone_rele(zone);
5162         return (0);
5163 }
5164 
5165 /*
5166  * Systemcall entry point for zone_getattr(2).
5167  */
5168 static ssize_t
5169 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5170 {
5171         size_t size;
5172         int error = 0, err;
5173         zone_t *zone;
5174         char *zonepath;
5175         char *outstr;
5176         zone_status_t zone_status;
5177         pid_t initpid;
5178         boolean_t global = (curzone == global_zone);
5179         boolean_t inzone = (curzone->zone_id == zoneid);
5180         ushort_t flags;
5181         zone_net_data_t *zbuf;
5182 
5183         mutex_enter(&zonehash_lock);
5184         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5185                 mutex_exit(&zonehash_lock);
5186                 return (set_errno(EINVAL));
5187         }
5188         zone_status = zone_status_get(zone);
5189         if (zone_status < ZONE_IS_INITIALIZED) {
5190                 mutex_exit(&zonehash_lock);
5191                 return (set_errno(EINVAL));
5192         }
5193         zone_hold(zone);
5194         mutex_exit(&zonehash_lock);
5195 
5196         /*
5197          * If not in the global zone, don't show information about other zones,
5198          * unless the system is labeled and the local zone's label dominates
5199          * the other zone's label.
5200          */
5201         if (!zone_list_access(zone)) {
5202                 zone_rele(zone);
5203                 return (set_errno(EINVAL));
5204         }
5205 
5206         switch (attr) {
5207         case ZONE_ATTR_ROOT:
5208                 if (global) {
5209                         /*
5210                          * Copy the path to trim the trailing "/" (except for
5211                          * the global zone).
5212                          */
5213                         if (zone != global_zone)
5214                                 size = zone->zone_rootpathlen - 1;
5215                         else
5216                                 size = zone->zone_rootpathlen;
5217                         zonepath = kmem_alloc(size, KM_SLEEP);
5218                         bcopy(zone->zone_rootpath, zonepath, size);
5219                         zonepath[size - 1] = '\0';
5220                 } else {
5221                         if (inzone || !is_system_labeled()) {
5222                                 /*
5223                          * Caller is not in the global zone.  If the
5224                          * query is on the current zone or the system
5225                          * is not labeled, just return a faked-up path
5226                          * for the current zone.
5227                                  */
5228                                 zonepath = "/";
5229                                 size = 2;
5230                         } else {
5231                                 /*
5232                                  * Return related path for current zone.
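                                      * E.g., a zone named "foo" yields
                                      * zone_prefix + "foo", conventionally
                                      * "/zone/foo".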
5233                                  */
5234                                 int prefix_len = strlen(zone_prefix);
5235                                 int zname_len = strlen(zone->zone_name);
5236 
5237                                 size = prefix_len + zname_len + 1;
5238                                 zonepath = kmem_alloc(size, KM_SLEEP);
5239                                 bcopy(zone_prefix, zonepath, prefix_len);
5240                                 bcopy(zone->zone_name, zonepath +
5241                                     prefix_len, zname_len);
5242                                 zonepath[size - 1] = '\0';
5243                         }
5244                 }
5245                 if (bufsize > size)
5246                         bufsize = size;
5247                 if (buf != NULL) {
5248                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5249                         if (err != 0 && err != ENAMETOOLONG)
5250                                 error = EFAULT;
5251                 }
5252                 if (global || (is_system_labeled() && !inzone))
5253                         kmem_free(zonepath, size);
5254                 break;
5255 
5256         case ZONE_ATTR_NAME:
5257                 size = strlen(zone->zone_name) + 1;
5258                 if (bufsize > size)
5259                         bufsize = size;
5260                 if (buf != NULL) {
5261                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5262                         if (err != 0 && err != ENAMETOOLONG)
5263                                 error = EFAULT;
5264                 }
5265                 break;
5266 
5267         case ZONE_ATTR_STATUS:
5268                 /*
5269                  * Since we're not holding zonehash_lock, the zone status
5270                  * may be anything; leave it up to userland to sort it out.
5271                  */
5272                 size = sizeof (zone_status);
5273                 if (bufsize > size)
5274                         bufsize = size;
5275                 zone_status = zone_status_get(zone);
5276                 if (buf != NULL &&
5277                     copyout(&zone_status, buf, bufsize) != 0)
5278                         error = EFAULT;
5279                 break;
5280         case ZONE_ATTR_FLAGS:
5281                 size = sizeof (zone->zone_flags);
5282                 if (bufsize > size)
5283                         bufsize = size;
5284                 flags = zone->zone_flags;
5285                 if (buf != NULL &&
5286                     copyout(&flags, buf, bufsize) != 0)
5287                         error = EFAULT;
5288                 break;
5289         case ZONE_ATTR_PRIVSET:
5290                 size = sizeof (priv_set_t);
5291                 if (bufsize > size)
5292                         bufsize = size;
5293                 if (buf != NULL &&
5294                     copyout(zone->zone_privset, buf, bufsize) != 0)
5295                         error = EFAULT;
5296                 break;
5297         case ZONE_ATTR_UNIQID:
5298                 size = sizeof (zone->zone_uniqid);
5299                 if (bufsize > size)
5300                         bufsize = size;
5301                 if (buf != NULL &&
5302                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5303                         error = EFAULT;
5304                 break;
5305         case ZONE_ATTR_POOLID:
5306                 {
5307                         pool_t *pool;
5308                         poolid_t poolid;
5309 
5310                         if (pool_lock_intr() != 0) {
5311                                 error = EINTR;
5312                                 break;
5313                         }
5314                         pool = zone_pool_get(zone);
5315                         poolid = pool->pool_id;
5316                         pool_unlock();
5317                         size = sizeof (poolid);
5318                         if (bufsize > size)
5319                                 bufsize = size;
5320                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5321                                 error = EFAULT;
5322                 }
5323                 break;
5324         case ZONE_ATTR_SLBL:
5325                 size = sizeof (bslabel_t);
5326                 if (bufsize > size)
5327                         bufsize = size;
5328                 if (zone->zone_slabel == NULL)
5329                         error = EINVAL;
5330                 else if (buf != NULL &&
5331                     copyout(label2bslabel(zone->zone_slabel), buf,
5332                     bufsize) != 0)
5333                         error = EFAULT;
5334                 break;
5335         case ZONE_ATTR_INITPID:
5336                 size = sizeof (initpid);
5337                 if (bufsize > size)
5338                         bufsize = size;
5339                 initpid = zone->zone_proc_initpid;
5340                 if (initpid == -1) {
5341                         error = ESRCH;
5342                         break;
5343                 }
5344                 if (buf != NULL &&
5345                     copyout(&initpid, buf, bufsize) != 0)
5346                         error = EFAULT;
5347                 break;
5348         case ZONE_ATTR_BRAND:
5349                 size = strlen(zone->zone_brand->b_name) + 1;
5350 
5351                 if (bufsize > size)
5352                         bufsize = size;
5353                 if (buf != NULL) {
5354                         err = copyoutstr(zone->zone_brand->b_name, buf,
5355                             bufsize, NULL);
5356                         if (err != 0 && err != ENAMETOOLONG)
5357                                 error = EFAULT;
5358                 }
5359                 break;
5360         case ZONE_ATTR_INITNAME:
5361                 size = strlen(zone->zone_initname) + 1;
5362                 if (bufsize > size)
5363                         bufsize = size;
5364                 if (buf != NULL) {
5365                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5366                             NULL);
5367                         if (err != 0 && err != ENAMETOOLONG)
5368                                 error = EFAULT;
5369                 }
5370                 break;
5371         case ZONE_ATTR_BOOTARGS:
5372                 if (zone->zone_bootargs == NULL)
5373                         outstr = "";
5374                 else
5375                         outstr = zone->zone_bootargs;
5376                 size = strlen(outstr) + 1;
5377                 if (bufsize > size)
5378                         bufsize = size;
5379                 if (buf != NULL) {
5380                         err = copyoutstr(outstr, buf, bufsize, NULL);
5381                         if (err != 0 && err != ENAMETOOLONG)
5382                                 error = EFAULT;
5383                 }
5384                 break;
5385         case ZONE_ATTR_PHYS_MCAP:
5386                 size = sizeof (zone->zone_phys_mcap);
5387                 if (bufsize > size)
5388                         bufsize = size;
5389                 if (buf != NULL &&
5390                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5391                         error = EFAULT;
5392                 break;
5393         case ZONE_ATTR_SCHED_CLASS:
5394                 mutex_enter(&class_lock);
5395 
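                     /*
                      * A zone with no valid default scheduling class set is
                      * reported as an empty string.
                      */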
5396                 if (zone->zone_defaultcid >= loaded_classes)
5397                         outstr = "";
5398                 else
5399                         outstr = sclass[zone->zone_defaultcid].cl_name;
5400                 size = strlen(outstr) + 1;
5401                 if (bufsize > size)
5402                         bufsize = size;
5403                 if (buf != NULL) {
5404                         err = copyoutstr(outstr, buf, bufsize, NULL);
5405                         if (err != 0 && err != ENAMETOOLONG)
5406                                 error = EFAULT;
5407                 }
5408 
5409                 mutex_exit(&class_lock);
5410                 break;
5411         case ZONE_ATTR_HOSTID:
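                     /*
                      * Unlike the other fixed-size attributes, the hostid is
                      * copied out only if the caller's buffer is exactly the
                      * right size and a hostid has been explicitly set.
                      */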
5412                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5413                     bufsize == sizeof (zone->zone_hostid)) {
5414                         size = sizeof (zone->zone_hostid);
5415                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5416                             bufsize) != 0)
5417                                 error = EFAULT;
5418                 } else {
5419                         error = EINVAL;
5420                 }
5421                 break;
5422         case ZONE_ATTR_FS_ALLOWED:
5423                 if (zone->zone_fs_allowed == NULL)
5424                         outstr = "";
5425                 else
5426                         outstr = zone->zone_fs_allowed;
5427                 size = strlen(outstr) + 1;
5428                 if (bufsize > size)
5429                         bufsize = size;
5430                 if (buf != NULL) {
5431                         err = copyoutstr(outstr, buf, bufsize, NULL);
5432                         if (err != 0 && err != ENAMETOOLONG)
5433                                 error = EFAULT;
5434                 }
5435                 break;
5436         case ZONE_ATTR_NETWORK:
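                     /*
                      * The caller passes in a zone_net_data_t selecting which
                      * network datum it wants; the answer is copied back out
                      * through the same buffer.
                      */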
5437                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5438                 if (copyin(buf, zbuf, bufsize) != 0) {
5439                         error = EFAULT;
5440                 } else {
5441                         error = zone_get_network(zoneid, zbuf);
5442                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5443                                 error = EFAULT;
5444                 }
5445                 kmem_free(zbuf, bufsize);
5446                 break;
5447         default:
5448                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5449                         size = bufsize;
5450                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5451                 } else {
5452                         error = EINVAL;
5453                 }
5454         }
5455         zone_rele(zone);
5456 
5457         if (error)
5458                 return (set_errno(error));
5459         return ((ssize_t)size);
5460 }
5461 
5462 /*
5463  * Systemcall entry point for zone_setattr(2).
5464  */
5465 /*ARGSUSED*/
5466 static int
5467 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5468 {
5469         zone_t *zone;
5470         zone_status_t zone_status;
5471         int err = -1;
5472         zone_net_data_t *zbuf;
5473 
5474         if (secpolicy_zone_config(CRED()) != 0)
5475                 return (set_errno(EPERM));
5476 
5477         /*
5478          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5479          * global zone.
5480          */
5481         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5482                 return (set_errno(EINVAL));
5483         }
5484 
5485         mutex_enter(&zonehash_lock);
5486         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5487                 mutex_exit(&zonehash_lock);
5488                 return (set_errno(EINVAL));
5489         }
5490         zone_hold(zone);
5491         mutex_exit(&zonehash_lock);
5492 
5493         /*
5494          * At present most attributes can only be set on non-running,
5495          * non-global zones.
5496          */
5497         zone_status = zone_status_get(zone);
5498         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5499                 err = EINVAL;
5500                 goto done;
5501         }
5502 
5503         switch (attr) {
5504         case ZONE_ATTR_INITNAME:
5505                 err = zone_set_initname(zone, (const char *)buf);
5506                 break;
5507         case ZONE_ATTR_BOOTARGS:
5508                 err = zone_set_bootargs(zone, (const char *)buf);
5509                 break;
5510         case ZONE_ATTR_BRAND:
5511                 err = zone_set_brand(zone, (const char *)buf);
5512                 break;
5513         case ZONE_ATTR_FS_ALLOWED:
5514                 err = zone_set_fs_allowed(zone, (const char *)buf);
5515                 break;
5516         case ZONE_ATTR_PHYS_MCAP:
5517                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5518                 break;
5519         case ZONE_ATTR_SCHED_CLASS:
5520                 err = zone_set_sched_class(zone, (const char *)buf);
5521                 break;
5522         case ZONE_ATTR_HOSTID:
5523                 if (bufsize == sizeof (zone->zone_hostid)) {
5524                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5525                                 err = 0;
5526                         else
5527                                 err = EFAULT;
5528                 } else {
5529                         err = EINVAL;
5530                 }
5531                 break;
5532         case ZONE_ATTR_NETWORK:
5533                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5534                         err = EINVAL;
5535                         break;
5536                 }
5537                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5538                 if (copyin(buf, zbuf, bufsize) != 0) {
5539                         kmem_free(zbuf, bufsize);
5540                         err = EFAULT;
5541                         break;
5542                 }
5543                 err = zone_set_network(zoneid, zbuf);
5544                 kmem_free(zbuf, bufsize);
5545                 break;
5546         default:
5547                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5548                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5549                 else
5550                         err = EINVAL;
5551         }
5552 
5553 done:
5554         zone_rele(zone);
5555         ASSERT(err != -1);
5556         return (err != 0 ? set_errno(err) : 0);
5557 }
5558 
5559 /*
5560  * Return zero if the process has at least one vnode mapped into its
5561  * address space which shouldn't be allowed to change zones.
5562  *
5563  * Also return zero if the process has any shared mappings which reserve
5564  * swap.  This is because the counting for zone.max-swap does not allow swap
5565  * reservation to be shared between zones.  Zone swap reservation is
5566  * counted against zone->zone_max_swap.
5567  */
5568 static int
5569 as_can_change_zones(void)
5570 {
5571         proc_t *pp = curproc;
5572         struct seg *seg;
5573         struct as *as = pp->p_as;
5574         vnode_t *vp;
5575         int allow = 1;
5576 
5577         ASSERT(pp->p_as != &kas);
5578         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5579         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5580 
5581                 /*
5582                  * Cannot enter zone with shared anon memory which
5583                  * reserves swap.  See comment above.
5584                  */
5585                 if (seg_can_change_zones(seg) == B_FALSE) {
5586                         allow = 0;
5587                         break;
5588                 }
5589                 /*
5590                  * if we can't get a backing vnode for this segment then skip
5591                  * it.
5592                  */
5593                 vp = NULL;
5594                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5595                         continue;
5596                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5597                         allow = 0;
5598                         break;
5599                 }
5600         }
5601         AS_LOCK_EXIT(as, &as->a_lock);
5602         return (allow);
5603 }
5604 
5605 /*
5606  * Count swap reserved by curproc's address space
5607  */
5608 static size_t
5609 as_swresv(void)
5610 {
5611         proc_t *pp = curproc;
5612         struct seg *seg;
5613         struct as *as = pp->p_as;
5614         size_t swap = 0;
5615 
5616         ASSERT(pp->p_as != &kas);
5617         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5618         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5619                 swap += seg_swresv(seg);
5620 
5621         return (swap);
5622 }
5623 
5624 /*
5625  * Systemcall entry point for zone_enter().
5626  *
5627  * The current process is injected into said zone.  In the process
5628  * it will change its project membership, privileges, rootdir/cwd,
5629  * zone-wide rctls, and pool association to match those of the zone.
5630  *
5631  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5632  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5633  * enter a zone that is "ready" or "running".
5634  */
5635 static int
5636 zone_enter(zoneid_t zoneid)
5637 {
5638         zone_t *zone;
5639         vnode_t *vp;
5640         proc_t *pp = curproc;
5641         contract_t *ct;
5642         cont_process_t *ctp;
5643         task_t *tk, *oldtk;
5644         kproject_t *zone_proj0;
5645         cred_t *cr, *newcr;
5646         pool_t *oldpool, *newpool;
5647         sess_t *sp;
5648         uid_t uid;
5649         zone_status_t status;
5650         int err = 0;
5651         rctl_entity_p_t e;
5652         size_t swap;
5653         kthread_id_t t;
5654 
5655         if (secpolicy_zone_config(CRED()) != 0)
5656                 return (set_errno(EPERM));
5657         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5658                 return (set_errno(EINVAL));
5659 
5660         /*
5661          * Stop all lwps so we don't need to hold a lock to look at
5662          * curproc->p_zone.  This needs to happen before we grab any
5663          * locks to avoid deadlock (another lwp in the process could
5664          * be waiting for the held lock).
5665          */
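             /*
              * holdlwps() returns zero if it was interrupted by a signal,
              * in which case we bail out with EINTR.
              */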
5666         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5667                 return (set_errno(EINTR));
5668 
5669         /*
5670          * Make sure we're not changing zones with files open or mapped in
5671          * to our address space which shouldn't be changing zones.
5672          */
5673         if (!files_can_change_zones()) {
5674                 err = EBADF;
5675                 goto out;
5676         }
5677         if (!as_can_change_zones()) {
5678                 err = EFAULT;
5679                 goto out;
5680         }
5681 
5682         mutex_enter(&zonehash_lock);
5683         if (pp->p_zone != global_zone) {
5684                 mutex_exit(&zonehash_lock);
5685                 err = EINVAL;
5686                 goto out;
5687         }
5688 
5689         zone = zone_find_all_by_id(zoneid);
5690         if (zone == NULL) {
5691                 mutex_exit(&zonehash_lock);
5692                 err = EINVAL;
5693                 goto out;
5694         }
5695 
5696         /*
5697          * To prevent processes in a zone from holding contracts on
5698          * extrazonal resources, and to avoid process contract
5699          * memberships which span zones, contract holders and processes
5700          * which aren't the sole members of their encapsulating process
5701          * contracts are not allowed to zone_enter.
5702          */
5703         ctp = pp->p_ct_process;
5704         ct = &ctp->conp_contract;
5705         mutex_enter(&ct->ct_lock);
5706         mutex_enter(&pp->p_lock);
5707         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5708                 mutex_exit(&pp->p_lock);
5709                 mutex_exit(&ct->ct_lock);
5710                 mutex_exit(&zonehash_lock);
5711                 err = EINVAL;
5712                 goto out;
5713         }
5714 
5715         /*
5716          * Moreover, we don't allow processes whose encapsulating
5717          * process contracts have inherited extrazonal contracts.
5718          * While it would be easier to eliminate all process contracts
5719          * with inherited contracts, we need to be able to give a
5720          * restarted init (or other zone-penetrating process) its
5721          * predecessor's contracts.
5722          */
5723         if (ctp->conp_ninherited != 0) {
5724                 contract_t *next;
5725                 for (next = list_head(&ctp->conp_inherited); next != NULL;
                         next = list_next(&ctp->conp_inherited, next)) {
5726                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5727                                 mutex_exit(&pp->p_lock);
5728                                 mutex_exit(&ct->ct_lock);
5729                                 mutex_exit(&zonehash_lock);
5730                                 err = EINVAL;
5731                                 goto out;
5732                         }
5733                 }
5734         }
5735 
5736         mutex_exit(&pp->p_lock);
5737         mutex_exit(&ct->ct_lock);
5738 
5739         status = zone_status_get(zone);
5740         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5741                 /*
5742                  * Can't join
5743                  */
5744                 mutex_exit(&zonehash_lock);
5745                 err = EINVAL;
5746                 goto out;
5747         }
5748 
5749         /*
5750          * Make sure new priv set is within the permitted set for caller
5751          */
5752         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5753                 mutex_exit(&zonehash_lock);
5754                 err = EPERM;
5755                 goto out;
5756         }
5757         /*
5758          * We want to momentarily drop zonehash_lock while we optimistically
5759          * bind curproc to the pool it should be running in.  This is safe
5760          * since the zone can't disappear (we have a hold on it).
5761          */
5762         zone_hold(zone);
5763         mutex_exit(&zonehash_lock);
5764 
5765         /*
5766          * Grab pool_lock to keep the pools configuration from changing
5767          * and to stop ourselves from getting rebound to another pool
5768          * until we join the zone.
5769          */
5770         if (pool_lock_intr() != 0) {
5771                 zone_rele(zone);
5772                 err = EINTR;
5773                 goto out;
5774         }
5775         ASSERT(secpolicy_pool(CRED()) == 0);
5776         /*
5777          * Bind ourselves to the pool currently associated with the zone.
5778          */
5779         oldpool = curproc->p_pool;
5780         newpool = zone_pool_get(zone);
5781         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5782             (err = pool_do_bind(newpool, P_PID, P_MYID,
5783             POOL_BIND_ALL)) != 0) {
5784                 pool_unlock();
5785                 zone_rele(zone);
5786                 goto out;
5787         }
5788 
5789         /*
5790          * Grab cpu_lock now; we'll need it later when we call
5791          * task_join().
5792          */
5793         mutex_enter(&cpu_lock);
5794         mutex_enter(&zonehash_lock);
5795         /*
5796          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5797          */
5798         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5799                 /*
5800                  * Can't join anymore.
5801                  */
5802                 mutex_exit(&zonehash_lock);
5803                 mutex_exit(&cpu_lock);
5804                 if (pool_state == POOL_ENABLED &&
5805                     newpool != oldpool)
5806                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5807                             POOL_BIND_ALL);
5808                 pool_unlock();
5809                 zone_rele(zone);
5810                 err = EINVAL;
5811                 goto out;
5812         }
5813 
5814         /*
5815          * a_lock must be held while transferring locked memory and swap
5816          * reservation from the global zone to the non global zone because
5817          * asynchronous faults on the processes' address space can lock
5818          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5819          * segments respectively.
5820          */
5821         AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER);
5822         swap = as_swresv();
5823         mutex_enter(&pp->p_lock);
5824         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5825         /* verify that we do not exceed any task or lwp limits */
5826         mutex_enter(&zone->zone_nlwps_lock);
5827         /* add new lwps to zone and zone's proj0 */
5828         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5829         zone->zone_nlwps += pp->p_lwpcnt;
5830         /* add 1 task to zone's proj0 */
5831         zone_proj0->kpj_ntasks += 1;
5832 
5833         zone_proj0->kpj_nprocs++;
5834         zone->zone_nprocs++;
5835         mutex_exit(&zone->zone_nlwps_lock);
5836 
5837         mutex_enter(&zone->zone_mem_lock);
5838         zone->zone_locked_mem += pp->p_locked_mem;
5839         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5840         zone->zone_max_swap += swap;
5841         mutex_exit(&zone->zone_mem_lock);
5842 
5843         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5844         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5845         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5846 
5847         /* remove lwps and process from proc's old zone and old project */
5848         mutex_enter(&pp->p_zone->zone_nlwps_lock);
5849         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5850         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5851         pp->p_task->tk_proj->kpj_nprocs--;
5852         pp->p_zone->zone_nprocs--;
5853         mutex_exit(&pp->p_zone->zone_nlwps_lock);
5854 
5855         mutex_enter(&pp->p_zone->zone_mem_lock);
5856         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5857         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5858         pp->p_zone->zone_max_swap -= swap;
5859         mutex_exit(&pp->p_zone->zone_mem_lock);
5860 
5861         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5862         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5863         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5864 
5865         pp->p_flag |= SZONETOP;
5866         pp->p_zone = zone;
5867         mutex_exit(&pp->p_lock);
5868         AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5869 
5870         /*
5871          * Joining the zone cannot fail from now on.
5872          *
5873          * This means that a lot of the following code can be commonized and
5874          * shared with zsched().
5875          */
5876 
5877         /*
5878          * If the process contract fmri was inherited, we need to
5879          * flag this so that any contract status will not leak
5880          * extra zone information (svc_fmri in this case).
5881          */
5882         if (ctp->conp_svc_ctid != ct->ct_id) {
5883                 mutex_enter(&ct->ct_lock);
5884                 ctp->conp_svc_zone_enter = ct->ct_id;
5885                 mutex_exit(&ct->ct_lock);
5886         }
5887 
5888         /*
5889          * Reset the encapsulating process contract's zone.
5890          */
5891         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5892         contract_setzuniqid(ct, zone->zone_uniqid);
5893 
5894         /*
5895          * Create a new task and associate the process with the project keyed
5896          * by (projid,zoneid).
5897          *
5898          * We might as well be in project 0; the global zone's projid doesn't
5899          * make much sense in a zone anyhow.
5900          *
5901          * This also increments zone_ntasks, and returns with p_lock held.
5902          */
5903         tk = task_create(0, zone);
5904         oldtk = task_join(tk, 0);
5905         mutex_exit(&cpu_lock);
5906 
5907         /*
5908          * call RCTLOP_SET functions on this proc
5909          */
5910         e.rcep_p.zone = zone;
5911         e.rcep_t = RCENTITY_ZONE;
5912         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5913             RCD_CALLBACK);
5914         mutex_exit(&pp->p_lock);
5915 
5916         /*
5917          * We don't need to hold any of zsched's locks here; not only do we know
5918          * the process and zone aren't going away, we know its session isn't
5919          * changing either.
5920          *
5921          * By joining zsched's session here, we mimic the behavior in the
5922          * global zone of init's sid being the pid of sched.  We extend this
5923          * to all zlogin-like zone_enter()'ing processes as well.
5924          */
5925         mutex_enter(&pidlock);
5926         sp = zone->zone_zsched->p_sessp;
5927         sess_hold(zone->zone_zsched);
5928         mutex_enter(&pp->p_lock);
5929         pgexit(pp);
5930         sess_rele(pp->p_sessp, B_TRUE);
5931         pp->p_sessp = sp;
5932         pgjoin(pp, zone->zone_zsched->p_pidp);
5933 
5934         /*
5935          * If any threads are scheduled to be placed on the zone wait queue,
5936          * they should abandon the idea since the wait queue is changing.
5937          * We need to be holding pidlock & p_lock to do this.
5938          */
5939         if ((t = pp->p_tlist) != NULL) {
5940                 do {
5941                         thread_lock(t);
5942                         /*
5943                          * Kick this thread so that it doesn't sit
5944                          * on the wrong wait queue.
5945                          */
5946                         if (ISWAITING(t))
5947                                 setrun_locked(t);
5948 
5949                         if (t->t_schedflag & TS_ANYWAITQ)
5950                                 t->t_schedflag &= ~ TS_ANYWAITQ;
5951 
5952                         thread_unlock(t);
5953                 } while ((t = t->t_forw) != pp->p_tlist);
5954         }
5955 
5956         /*
5957          * If there is a default scheduling class for the zone and it is not
5958          * the class we are currently in, change all of the threads in the
5959          * process to the new class.  We need to be holding pidlock & p_lock
5960          * when we call parmsset so this is a good place to do it.
5961          */
5962         if (zone->zone_defaultcid > 0 &&
5963             zone->zone_defaultcid != curthread->t_cid) {
5964                 pcparms_t pcparms;
5965 
5966                 pcparms.pc_cid = zone->zone_defaultcid;
5967                 pcparms.pc_clparms[0] = 0;
5968 
5969                 /*
5970                  * If setting the class fails, we still want to enter the zone.
5971                  */
5972                 if ((t = pp->p_tlist) != NULL) {
5973                         do {
5974                                 (void) parmsset(&pcparms, t);
5975                         } while ((t = t->t_forw) != pp->p_tlist);
5976                 }
5977         }
5978 
5979         mutex_exit(&pp->p_lock);
5980         mutex_exit(&pidlock);
5981 
5982         mutex_exit(&zonehash_lock);
5983         /*
5984          * We're firmly in the zone; let pools progress.
5985          */
5986         pool_unlock();
5987         task_rele(oldtk);
5988         /*
5989          * We don't need to retain a hold on the zone since we already
5990          * incremented zone_ntasks, so the zone isn't going anywhere.
5991          */
5992         zone_rele(zone);
5993 
5994         /*
5995          * Chroot
5996          */
5997         vp = zone->zone_rootvp;
5998         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
5999         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6000 
6001         /*
6002          * Change process credentials
6003          */
6004         newcr = cralloc();
6005         mutex_enter(&pp->p_crlock);
6006         cr = pp->p_cred;
6007         crcopy_to(cr, newcr);
6008         crsetzone(newcr, zone);
6009         pp->p_cred = newcr;
6010 
6011         /*
6012          * Restrict all process privilege sets to zone limit
6013          */
6014         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6015         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6016         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6017         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6018         mutex_exit(&pp->p_crlock);
6019         crset(pp, newcr);
6020 
6021         /*
6022          * Adjust upcount to reflect zone entry.
6023          */
6024         uid = crgetruid(newcr);
6025         mutex_enter(&pidlock);
6026         upcount_dec(uid, GLOBAL_ZONEID);
6027         upcount_inc(uid, zoneid);
6028         mutex_exit(&pidlock);
6029 
6030         /*
6031          * Set up core file path and content.
6032          */
6033         set_core_defaults();
6034 
6035 out:
6036         /*
6037          * Let the other lwps continue.
6038          */
6039         mutex_enter(&pp->p_lock);
6040         if (curthread != pp->p_agenttp)
6041                 continuelwps(pp);
6042         mutex_exit(&pp->p_lock);
6043 
6044         return (err != 0 ? set_errno(err) : 0);
6045 }
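
/*
 * Illustrative sketch (userland, not compiled here): a privileged
 * global-zone process typically enters a running zone through the
 * zone_enter() wrapper declared in <zone.h> and then re-execs so that the
 * new process image is constructed entirely inside the zone.  The zone
 * name is hypothetical and error handling is abbreviated:
 *
 *	zoneid_t zid;
 *
 *	if ((zid = getzoneidbyname("web01")) == -1)
 *		err(1, "getzoneidbyname");
 *	if (zone_enter(zid) != 0)
 *		err(1, "zone_enter");
 *	(void) execl("/usr/bin/sh", "sh", (char *)NULL);
 */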
6046 
6047 /*
6048  * System call entry point for zone_list(2).
6049  *
6050  * Processes running in a (non-global) zone only see themselves.
6051  * On labeled systems, they see all zones whose label they dominate.
6052  */
6053 static int
6054 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6055 {
6056         zoneid_t *zoneids;
6057         zone_t *zone, *myzone;
6058         uint_t user_nzones, real_nzones;
6059         uint_t domi_nzones;
6060         int error;
6061 
6062         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6063                 return (set_errno(EFAULT));
6064 
6065         myzone = curproc->p_zone;
6066         if (myzone != global_zone) {
6067                 bslabel_t *mybslab;
6068 
6069                 if (!is_system_labeled()) {
6070                         /* just return current zone */
6071                         real_nzones = domi_nzones = 1;
6072                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6073                         zoneids[0] = myzone->zone_id;
6074                 } else {
6075                         /* return all zones that are dominated */
6076                         mutex_enter(&zonehash_lock);
6077                         real_nzones = zonecount;
6078                         domi_nzones = 0;
6079                         if (real_nzones > 0) {
6080                                 zoneids = kmem_alloc(real_nzones *
6081                                     sizeof (zoneid_t), KM_SLEEP);
6082                                 mybslab = label2bslabel(myzone->zone_slabel);
6083                                 list_for_each(&zone_active, zone) {
6084                                         if (zone->zone_id == GLOBAL_ZONEID)
6085                                                 continue;
6086                                         if (zone != myzone &&
6087                                             (zone->zone_flags & ZF_IS_SCRATCH))
6088                                                 continue;
6089                                         /*
6090                                          * Note that a label always dominates
6091                                          * itself, so myzone is always included
6092                                          * in the list.
6093                                          */
6094                                         if (bldominates(mybslab,
6095                                             label2bslabel(zone->zone_slabel))) {
6096                                                 zoneids[domi_nzones++] =
6097                                                     zone->zone_id;
6098                                         }
6099                                 }
6100                         }
6101                         mutex_exit(&zonehash_lock);
6102                 }
6103         } else {
6104                 mutex_enter(&zonehash_lock);
6105                 real_nzones = zonecount;
6106                 domi_nzones = 0;
6107                 if (real_nzones > 0) {
6108                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6109                             KM_SLEEP);
6110                         list_for_each(&zone_active, zone)
6111                                 zoneids[domi_nzones++] = zone->zone_id;
6112                         ASSERT(domi_nzones == real_nzones);
6113                 }
6114                 mutex_exit(&zonehash_lock);
6115         }
6116 
6117         /*
6118          * If the caller allocated space for fewer entries than we found,
6119          * return only up to that limit.  Either way, report exactly how
6120          * many we found.
6121          */
6122         if (domi_nzones < user_nzones)
6123                 user_nzones = domi_nzones;
6124         error = 0;
6125         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6126                 error = EFAULT;
6127         } else if (zoneidlist != NULL && user_nzones != 0) {
6128                 if (copyout(zoneids, zoneidlist,
6129                     user_nzones * sizeof (zoneid_t)) != 0)
6130                         error = EFAULT;
6131         }
6132 
6133         if (real_nzones > 0)
6134                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6135 
6136         if (error != 0)
6137                 return (set_errno(error));
6138         else
6139                 return (0);
6140 }
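
/*
 * Illustrative sketch (userland, not compiled here): zone_list(2) supports
 * the usual "ask for the count, then ask again with a buffer" pattern via
 * the zone_list() wrapper declared in <zone.h>.  Note that the count can
 * change between the two calls; error handling is abbreviated:
 *
 *	uint_t nzones = 0;
 *	zoneid_t *ids;
 *
 *	(void) zone_list(NULL, &nzones);
 *	ids = malloc(nzones * sizeof (zoneid_t));
 *	if (zone_list(ids, &nzones) != 0)
 *		err(1, "zone_list");
 */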
6141 
6142 /*
6143  * System call entry point for zone_lookup(2).
6144  *
6145  * Non-global zones are only able to see themselves and (on labeled systems)
6146  * the zones they dominate.
6147  */
6148 static zoneid_t
6149 zone_lookup(const char *zone_name)
6150 {
6151         char *kname;
6152         zone_t *zone;
6153         zoneid_t zoneid;
6154         int err;
6155 
6156         if (zone_name == NULL) {
6157                 /* return caller's zone id */
6158                 return (getzoneid());
6159         }
6160 
6161         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6162         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6163                 kmem_free(kname, ZONENAME_MAX);
6164                 return (set_errno(err));
6165         }
6166 
6167         mutex_enter(&zonehash_lock);
6168         zone = zone_find_all_by_name(kname);
6169         kmem_free(kname, ZONENAME_MAX);
6170         /*
6171          * In a non-global zone, we can only look up the global zone and
6172          * our own name.  In Trusted Extensions, label dominance rules apply.
6173          */
6174         if (zone == NULL ||
6175             zone_status_get(zone) < ZONE_IS_READY ||
6176             !zone_list_access(zone)) {
6177                 mutex_exit(&zonehash_lock);
6178                 return (set_errno(EINVAL));
6179         } else {
6180                 zoneid = zone->zone_id;
6181                 mutex_exit(&zonehash_lock);
6182                 return (zoneid);
6183         }
6184 }
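
/*
 * Illustrative sketch (userland, not compiled here): getzoneidbyname(3C)
 * is the usual consumer of zone_lookup(2).  Passing NULL returns the
 * caller's own zone ID, matching the NULL handling above; the zone name
 * is hypothetical:
 *
 *	zoneid_t zid = getzoneidbyname("web01");
 *	zoneid_t self = getzoneidbyname(NULL);
 */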
6185 
6186 static int
6187 zone_version(int *version_arg)
6188 {
6189         int version = ZONE_SYSCALL_API_VERSION;
6190 
6191         if (copyout(&version, version_arg, sizeof (int)) != 0)
6192                 return (set_errno(EFAULT));
6193         return (0);
6194 }
6195 
6196 /* ARGSUSED */
6197 long
6198 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6199 {
6200         zone_def zs;
6201         int err;
6202 
6203         switch (cmd) {
6204         case ZONE_CREATE:
6205                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6206                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6207                                 return (set_errno(EFAULT));
6208                         }
6209                 } else {
6210 #ifdef _SYSCALL32_IMPL
6211                         zone_def32 zs32;
6212 
6213                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6214                                 return (set_errno(EFAULT));
6215                         }
6216                         zs.zone_name =
6217                             (const char *)(unsigned long)zs32.zone_name;
6218                         zs.zone_root =
6219                             (const char *)(unsigned long)zs32.zone_root;
6220                         zs.zone_privs =
6221                             (const struct priv_set *)
6222                             (unsigned long)zs32.zone_privs;
6223                         zs.zone_privssz = zs32.zone_privssz;
6224                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6225                         zs.rctlbufsz = zs32.rctlbufsz;
6226                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6227                         zs.zfsbufsz = zs32.zfsbufsz;
6228                         zs.extended_error =
6229                             (int *)(unsigned long)zs32.extended_error;
6230                         zs.match = zs32.match;
6231                         zs.doi = zs32.doi;
6232                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6233                         zs.flags = zs32.flags;
6234 #else
6235                         panic("get_udatamodel() returned bogus result\n");
6236 #endif
6237                 }
6238 
6239                 return (zone_create(zs.zone_name, zs.zone_root,
6240                     zs.zone_privs, zs.zone_privssz,
6241                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6242                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6243                     zs.extended_error, zs.match, zs.doi,
6244                     zs.label, zs.flags));
6245         case ZONE_BOOT:
6246                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6247         case ZONE_DESTROY:
6248                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6249         case ZONE_GETATTR:
6250                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6251                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6252         case ZONE_SETATTR:
6253                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6254                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6255         case ZONE_ENTER:
6256                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6257         case ZONE_LIST:
6258                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6259         case ZONE_SHUTDOWN:
6260                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6261         case ZONE_LOOKUP:
6262                 return (zone_lookup((const char *)arg1));
6263         case ZONE_VERSION:
6264                 return (zone_version((int *)arg1));
6265         case ZONE_ADD_DATALINK:
6266                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6267                     (datalink_id_t)(uintptr_t)arg2));
6268         case ZONE_DEL_DATALINK:
6269                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6270                     (datalink_id_t)(uintptr_t)arg2));
6271         case ZONE_CHECK_DATALINK: {
6272                 zoneid_t        zoneid;
6273                 boolean_t       need_copyout;
6274 
6275                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6276                         return (set_errno(EFAULT));
6277                 need_copyout = (zoneid == ALL_ZONES);
6278                 err = zone_check_datalink(&zoneid,
6279                     (datalink_id_t)(uintptr_t)arg2);
6280                 if (err == 0 && need_copyout) {
6281                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6282                                 err = EFAULT;
6283                 }
6284                 return (err == 0 ? 0 : set_errno(err));
6285         }
6286         case ZONE_LIST_DATALINK:
6287                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6288                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6289         default:
6290                 return (set_errno(EINVAL));
6291         }
6292 }
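
/*
 * Illustrative sketch (userland, not compiled here): the libc zone
 * wrappers all funnel through this single entry point.  For example, the
 * ZONE_VERSION subcode above services a raw call such as the following,
 * with SYS_zone coming from <sys/syscall.h>:
 *
 *	int ver;
 *
 *	if (syscall(SYS_zone, ZONE_VERSION, &ver) == 0)
 *		(void) printf("zone syscall API version %d\n", ver);
 */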
6293 
6294 struct zarg {
6295         zone_t *zone;
6296         zone_cmd_arg_t arg;
6297 };
6298 
6299 static int
6300 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6301 {
6302         char *buf;
6303         size_t buflen;
6304         int error;
6305 
6306         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6307         buf = kmem_alloc(buflen, KM_SLEEP);
6308         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6309         error = door_ki_open(buf, doorp);
6310         kmem_free(buf, buflen);
6311         return (error);
6312 }
6313 
6314 static void
6315 zone_release_door(door_handle_t *doorp)
6316 {
6317         door_ki_rele(*doorp);
6318         *doorp = NULL;
6319 }
6320 
6321 static void
6322 zone_ki_call_zoneadmd(struct zarg *zargp)
6323 {
6324         door_handle_t door = NULL;
6325         door_arg_t darg, save_arg;
6326         char *zone_name;
6327         size_t zone_namelen;
6328         zoneid_t zoneid;
6329         zone_t *zone;
6330         zone_cmd_arg_t arg;
6331         uint64_t uniqid;
6332         size_t size;
6333         int error;
6334         int retry;
6335 
6336         zone = zargp->zone;
6337         arg = zargp->arg;
6338         kmem_free(zargp, sizeof (*zargp));
6339 
6340         zone_namelen = strlen(zone->zone_name) + 1;
6341         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6342         bcopy(zone->zone_name, zone_name, zone_namelen);
6343         zoneid = zone->zone_id;
6344         uniqid = zone->zone_uniqid;
6345         /*
6346          * zoneadmd may be down, but at least we can empty out the zone.
6347          * We can ignore the return value of zone_empty() since we're called
6348          * from a kernel thread and know we won't be delivered any signals.
6349          */
6350         ASSERT(curproc == &p0);
6351         (void) zone_empty(zone);
6352         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6353         zone_rele(zone);
6354 
6355         size = sizeof (arg);
6356         darg.rbuf = (char *)&arg;
6357         darg.data_ptr = (char *)&arg;
6358         darg.rsize = size;
6359         darg.data_size = size;
6360         darg.desc_ptr = NULL;
6361         darg.desc_num = 0;
6362 
6363         save_arg = darg;
6364         /*
6365          * Since we're not holding a reference to the zone, any number of
6366          * things can go wrong, including the zone disappearing before we get a
6367          * chance to talk to zoneadmd.
6368          */
6369         for (retry = 0; /* forever */; retry++) {
6370                 if (door == NULL &&
6371                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6372                         goto next;
6373                 }
6374                 ASSERT(door != NULL);
6375 
6376                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6377                     SIZE_MAX, 0)) == 0) {
6378                         break;
6379                 }
6380                 switch (error) {
6381                 case EINTR:
6382                         /* FALLTHROUGH */
6383                 case EAGAIN:    /* process may be forking */
6384                         /*
6385                          * Back off for a bit
6386                          */
6387                         break;
6388                 case EBADF:
6389                         zone_release_door(&door);
6390                         if (zone_lookup_door(zone_name, &door) != 0) {
6391                                 /*
6392                                  * zoneadmd may be dead, but it may come back to
6393                                  * life later.
6394                                  */
6395                                 break;
6396                         }
6397                         break;
6398                 default:
6399                         cmn_err(CE_WARN,
6400                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6401                             error);
6402                         goto out;
6403                 }
6404 next:
6405                 /*
6406                  * If this isn't the same zone_t that we originally had in mind,
6407                  * then this is the same as if two kadmin requests come in at
6408                  * the same time: the first one wins.  This means we lose, so we
6409                  * bail.
6410                  */
6411                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6412                         /*
6413                          * Problem is solved.
6414                          */
6415                         break;
6416                 }
6417                 if (zone->zone_uniqid != uniqid) {
6418                         /*
6419                          * zoneid recycled
6420                          */
6421                         zone_rele(zone);
6422                         break;
6423                 }
6424                 /*
6425                  * We could zone_status_timedwait(), but there doesn't seem to
6426                  * be much point in doing that (plus, it would mean that
6427                  * zone_free() isn't called until this thread exits).
6428                  */
6429                 zone_rele(zone);
6430                 delay(hz);
6431                 darg = save_arg;
6432         }
6433 out:
6434         if (door != NULL) {
6435                 zone_release_door(&door);
6436         }
6437         kmem_free(zone_name, zone_namelen);
6438         thread_exit();
6439 }
6440 
6441 /*
6442  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6443  * kadmin().  The caller is a process in the zone.
6444  *
6445  * In order to shut down the zone, we hand off control to zoneadmd
6446  * (running in the global zone) via a door.  We do a half-hearted job of
6447  * killing all processes in the zone, create a kernel thread to contact
6448  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6449  * a form of generation number used to let zoneadmd (as well as
6450  * zone_destroy()) know exactly which zone they're talking about.
6451  */
6452 int
6453 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6454 {
6455         struct zarg *zargp;
6456         zone_cmd_t zcmd;
6457         zone_t *zone;
6458 
6459         zone = curproc->p_zone;
6460         ASSERT(getzoneid() != GLOBAL_ZONEID);
6461 
6462         switch (cmd) {
6463         case A_SHUTDOWN:
6464                 switch (fcn) {
6465                 case AD_HALT:
6466                 case AD_POWEROFF:
6467                         zcmd = Z_HALT;
6468                         break;
6469                 case AD_BOOT:
6470                         zcmd = Z_REBOOT;
6471                         break;
6472                 case AD_IBOOT:
6473                 case AD_SBOOT:
6474                 case AD_SIBOOT:
6475                 case AD_NOSYNC:
6476                         return (ENOTSUP);
6477                 default:
6478                         return (EINVAL);
6479                 }
6480                 break;
6481         case A_REBOOT:
6482                 zcmd = Z_REBOOT;
6483                 break;
6484         case A_FTRACE:
6485         case A_REMOUNT:
6486         case A_FREEZE:
6487         case A_DUMP:
6488         case A_CONFIG:
6489                 return (ENOTSUP);
6490         default:
6491                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6492                 return (EINVAL);
6493         }
6494 
6495         if (secpolicy_zone_admin(credp, B_FALSE))
6496                 return (EPERM);
6497         mutex_enter(&zone_status_lock);
6498 
6499         /*
6500          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6501          * is in the zone.
6502          */
6503         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6504         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6505                 /*
6506                  * This zone is already on its way down.
6507                  */
6508                 mutex_exit(&zone_status_lock);
6509                 return (0);
6510         }
6511         /*
6512          * Prevent future zone_enter()s
6513          */
6514         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6515         mutex_exit(&zone_status_lock);
6516 
6517         /*
6518          * Kill everyone now and call zoneadmd later.
6519          * zone_ki_call_zoneadmd() will do a more thorough job of this
6520          * than we can here.
6521          */
6522         killall(zone->zone_id);
6523         /*
6524          * Now, create the thread to contact zoneadmd and do the rest of the
6525          * work.  This thread can't be created in our zone otherwise
6526          * zone_destroy() would deadlock.
6527          */
6528         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6529         zargp->arg.cmd = zcmd;
6530         zargp->arg.uniqid = zone->zone_uniqid;
6531         zargp->zone = zone;
6532         (void) strcpy(zargp->arg.locale, "C");
6533         /* mdep was already copied in for us by uadmin */
6534         if (mdep != NULL)
6535                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6536                     sizeof (zargp->arg.bootbuf));
6537         zone_hold(zone);
6538 
6539         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6540             TS_RUN, minclsyspri);
6541         exit(CLD_EXITED, 0);
6542 
6543         return (EINVAL);
6544 }
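
/*
 * Illustrative sketch (userland, not compiled here): from inside a
 * non-global zone, a conventional reboot request such as
 *
 *	(void) uadmin(A_SHUTDOWN, AD_BOOT, (uintptr_t)NULL);
 *
 * arrives here and becomes a Z_REBOOT door request to zoneadmd rather
 * than a machine reboot; the calling process then exits via the exit()
 * call above.
 */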
6545 
6546 /*
6547  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6548  * status to ZONE_IS_SHUTTING_DOWN.
6549  *
6550  * This function also shuts down all running zones to ensure that they won't
6551  * fork new processes.
6552  */
6553 void
6554 zone_shutdown_global(void)
6555 {
6556         zone_t *current_zonep;
6557 
6558         ASSERT(INGLOBALZONE(curproc));
6559         mutex_enter(&zonehash_lock);
6560         mutex_enter(&zone_status_lock);
6561 
6562         /* Modify the global zone's status first. */
6563         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6564         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6565 
6566         /*
6567          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6568          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6569          * could cause assertions to fail (e.g., assertions about a zone's
6570          * state during initialization, readying, or booting) or produce races.
6571          * We'll let threads continue to initialize and ready new zones: they'll
6572          * fail to boot the new zones when they see that the global zone is
6573          * shutting down.
6574          */
6575         list_for_each(&zone_active, current_zonep) {
6576                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6577                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6578         }
6579         mutex_exit(&zone_status_lock);
6580         mutex_exit(&zonehash_lock);
6581 }
6582 
6583 /*
6584  * Returns true if the named dataset is visible in the current zone.
6585  * The 'write' parameter is set to 1 if the dataset is also writable.
6586  */
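/*
 * For example (dataset names are hypothetical), with "tank/zone1"
 * delegated to the zone:
 *
 *	"tank/zone1"		visible, writable (exact match)
 *	"tank/zone1/fs"		visible, writable (child dataset)
 *	"tank/zone1@snap"	visible, writable (snapshot of a match)
 *	"tank"			visible, read-only (parent of a delegation)
 *	"tank/other"		not visible
 *
 * Datasets mounted into the zone via "add fs" (matched against the
 * zone_vfslist below) are likewise visible but reported read-only.
 */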
6587 int
6588 zone_dataset_visible(const char *dataset, int *write)
6589 {
6590         static int zfstype = -1;
6591         zone_dataset_t *zd;
6592         size_t len;
6593         zone_t *zone = curproc->p_zone;
6594         const char *name = NULL;
6595         vfs_t *vfsp = NULL;
6596 
6597         if (dataset[0] == '\0')
6598                 return (0);
6599 
6600         /*
6601          * Walk the list once, looking for datasets which match exactly, or
6602          * specify a dataset underneath an exported dataset.  If found, return
6603          * true and note that it is writable.
6604          */
6605         list_for_each(&zone->zone_datasets, zd) {
6606                 len = strlen(zd->zd_dataset);
6607                 if (strlen(dataset) >= len &&
6608                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6609                     (dataset[len] == '\0' || dataset[len] == '/' ||
6610                     dataset[len] == '@')) {
6611                         if (write)
6612                                 *write = 1;
6613                         return (1);
6614                 }
6615         }
6616 
6617         /*
6618          * Walk the list a second time, searching for datasets which are parents
6619          * of exported datasets.  These should be visible, but read-only.
6620          *
6621          * Note that we also have to support forms such as 'pool/dataset/', with
6622          * a trailing slash.
6623          */
6624         list_for_each(&zone->zone_datasets, zd) {
6625                 len = strlen(dataset);
6626                 if (dataset[len - 1] == '/')
6627                         len--;  /* Ignore trailing slash */
6628                 if (len < strlen(zd->zd_dataset) &&
6629                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6630                     zd->zd_dataset[len] == '/') {
6631                         if (write)
6632                                 *write = 0;
6633                         return (1);
6634                 }
6635         }
6636 
6637         /*
6638          * We reach here if the given dataset is not found in the
6639          * zone_datasets list.  Check if this dataset was added as a
6640          * filesystem (i.e., "add fs") rather than delegated.  For this we
6641          * search for the dataset in the zone_vfslist of this zone.  If
6642          * found, return true and note that it is not writable.
6643          */
6644 
6645         /*
6646          * Initialize zfstype if it is not initialized yet.
6647          */
6648         if (zfstype == -1) {
6649                 struct vfssw *vswp = vfs_getvfssw("zfs");
6650                 zfstype = vswp - vfssw;
6651                 vfs_unrefvfssw(vswp);
6652         }
6653 
6654         vfs_list_read_lock();
6655         vfsp = zone->zone_vfslist;
6656         do {
6657                 ASSERT(vfsp);
6658                 if (vfsp->vfs_fstype == zfstype) {
6659                         name = refstr_value(vfsp->vfs_resource);
6660 
6661                         /*
6662                          * Check if we have an exact match.
6663                          */
6664                         if (strcmp(dataset, name) == 0) {
6665                                 vfs_list_unlock();
6666                                 if (write)
6667                                         *write = 0;
6668                                 return (1);
6669                         }
6670                         /*
6671                          * We need to check if we are looking for parents of
6672                          * a dataset. These should be visible, but read-only.
6673                          */
6674                         len = strlen(dataset);
6675                         if (dataset[len - 1] == '/')
6676                                 len--;
6677 
6678                         if (len < strlen(name) &&
6679                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6680                                 vfs_list_unlock();
6681                                 if (write)
6682                                         *write = 0;
6683                                 return (1);
6684                         }
6685                 }
6686                 vfsp = vfsp->vfs_zone_next;
6687         } while (vfsp != zone->zone_vfslist);
6688 
6689         vfs_list_unlock();
6690         return (0);
6691 }
6692 
6693 /*
6694  * zone_find_by_any_path() -
6695  *
6696  * kernel-private routine similar to zone_find_by_path(), but which
6697  * effectively compares against zone paths rather than zonerootpath
6698  * (i.e., the last component of zonerootpaths, which should be "root/",
6699  * is not compared.)  This is done in order to accurately identify all
6700  * paths, whether zone-visible or not, including those which are parallel
6701  * to /root/, such as /dev/, /home/, etc...
6702  *
6703  * If the specified path does not fall under any zone path then the
6704  * global zone is returned.
6705  *
6706  * The treat_abs parameter indicates whether the path should be treated as
6707  * an absolute path although it does not begin with "/".  (This supports
6708  * nfs mount syntax such as host:any/path.)
6709  *
6710  * The caller is responsible for zone_rele of the returned zone.
6711  */
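/*
 * For example (paths are hypothetical): a zone whose zonerootpath is
 * "/zones/web01/root/" is matched here against the prefix "/zones/web01/",
 * so both "/zones/web01/dev/null" and "/zones/web01/root/etc/passwd"
 * resolve to that zone, while "/export/home" falls through to the global
 * zone.
 */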
6712 zone_t *
6713 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6714 {
6715         zone_t *zone;
6716         int path_offset = 0;
6717 
6718         if (path == NULL) {
6719                 zone_hold(global_zone);
6720                 return (global_zone);
6721         }
6722 
6723         if (*path != '/') {
6724                 ASSERT(treat_abs);
6725                 path_offset = 1;
6726         }
6727 
6728         mutex_enter(&zonehash_lock);
6729         list_for_each(&zone_active, zone) {
6730                 char    *c;
6731                 size_t  pathlen;
6732                 char *rootpath_start;
6733 
6734                 if (zone == global_zone)        /* skip global zone */
6735                         continue;
6736 
6737                 /* scan backwards to find start of last component */
6738                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6739                 do {
6740                         c--;
6741                 } while (*c != '/');
6742 
6743                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6744                 rootpath_start = (zone->zone_rootpath + path_offset);
6745                 if (strncmp(path, rootpath_start, pathlen) == 0)
6746                         break;
6747         }
6748         if (zone == NULL)
6749                 zone = global_zone;
6750         zone_hold(zone);
6751         mutex_exit(&zonehash_lock);
6752         return (zone);
6753 }
6754 
6755 /*
6756  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6757  * zone_dl_t pointer if found, and NULL otherwise.
6758  */
6759 static zone_dl_t *
6760 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6761 {
6762         zone_dl_t *zdl;
6763 
6764         ASSERT(mutex_owned(&zone->zone_lock));
6765         list_for_each(&zone->zone_dl_list, zdl) {
6766                 if (zdl->zdl_id == linkid)
6767                         break;
6768         }
6769         return (zdl);
6770 }
6771 
6772 static boolean_t
6773 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6774 {
6775         boolean_t exists;
6776 
6777         mutex_enter(&zone->zone_lock);
6778         exists = (zone_find_dl(zone, linkid) != NULL);
6779         mutex_exit(&zone->zone_lock);
6780         return (exists);
6781 }
6782 
6783 /*
6784  * Add a datalink name for the zone.
6785  */
6786 static int
6787 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6788 {
6789         zone_dl_t *zdl;
6790         zone_t *zone;
6791         zone_t *thiszone;
6792 
6793         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6794                 return (set_errno(ENXIO));
6795 
6796         /* Verify that the datalink ID doesn't already belong to a zone. */
6797         mutex_enter(&zonehash_lock);
6798         list_for_each(&zone_active, zone) {
6799                 if (zone_dl_exists(zone, linkid)) {
6800                         mutex_exit(&zonehash_lock);
6801                         zone_rele(thiszone);
6802                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6803                 }
6804         }
6805 
6806         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6807         zdl->zdl_id = linkid;
6808         zdl->zdl_net = NULL;
6809         mutex_enter(&thiszone->zone_lock);
6810         list_insert_head(&thiszone->zone_dl_list, zdl);
6811         mutex_exit(&thiszone->zone_lock);
6812         mutex_exit(&zonehash_lock);
6813         zone_rele(thiszone);
6814         return (0);
6815 }
6816 
6817 static int
6818 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6819 {
6820         zone_dl_t *zdl;
6821         zone_t *zone;
6822         int err = 0;
6823 
6824         if ((zone = zone_find_by_id(zoneid)) == NULL)
6825                 return (set_errno(EINVAL));
6826 
6827         mutex_enter(&zone->zone_lock);
6828         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6829                 err = ENXIO;
6830         } else {
6831                 list_remove(&zone->zone_dl_list, zdl);
6832                 if (zdl->zdl_net != NULL)
6833                         nvlist_free(zdl->zdl_net);
6834                 kmem_free(zdl, sizeof (zone_dl_t));
6835         }
6836         mutex_exit(&zone->zone_lock);
6837         zone_rele(zone);
6838         return (err == 0 ? 0 : set_errno(err));
6839 }
6840 
6841 /*
6842  * If *zoneidp is ALL_ZONES, look up which zone has been assigned the
6843  * linkid.  Otherwise just check whether the specified *zoneidp has been
6844  * assigned the supplied linkid.
6845  */
6846 int
6847 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6848 {
6849         zone_t *zone;
6850         int err = ENXIO;
6851 
6852         if (*zoneidp != ALL_ZONES) {
6853                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6854                         if (zone_dl_exists(zone, linkid))
6855                                 err = 0;
6856                         zone_rele(zone);
6857                 }
6858                 return (err);
6859         }
6860 
6861         mutex_enter(&zonehash_lock);
6862         list_for_each(&zone_active, zone) {
6863                 if (zone_dl_exists(zone, linkid)) {
6864                         *zoneidp = zone->zone_id;
6865                         err = 0;
6866                         break;
6867                 }
6868         }
6869         mutex_exit(&zonehash_lock);
6870         return (err);
6871 }
6872 
6873 /*
6874  * Get the list of datalink IDs assigned to a zone.
6875  *
6876  * On input, *nump is the number of datalink IDs that can fit in the supplied
6877  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6878  * that were placed in the array if the array was large enough, or to the
6879  * number of datalink IDs that the function needs to place in the array if the
6880  * array is too small.
6881  */
6882 static int
6883 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6884 {
6885         uint_t num, dlcount;
6886         zone_t *zone;
6887         zone_dl_t *zdl;
6888         datalink_id_t *idptr = idarray;
6889 
6890         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6891                 return (set_errno(EFAULT));
6892         if ((zone = zone_find_by_id(zoneid)) == NULL)
6893                 return (set_errno(ENXIO));
6894 
6895         num = 0;
6896         mutex_enter(&zone->zone_lock);
6897         list_for_each(&zone->zone_dl_list, zdl) {
6898                 /*
6899                  * If the list is bigger than what the caller supplied, just
6900                  * count, don't do copyout.
6901                  */
6902                 if (++num > dlcount)
6903                         continue;
6904                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6905                         mutex_exit(&zone->zone_lock);
6906                         zone_rele(zone);
6907                         return (set_errno(EFAULT));
6908                 }
6909                 idptr++;
6910         }
6911         mutex_exit(&zone->zone_lock);
6912         zone_rele(zone);
6913 
6914         /* Whether the count increased or decreased, notify the caller. */
6915         if (num != dlcount) {
6916                 if (copyout(&num, nump, sizeof (num)) != 0)
6917                         return (set_errno(EFAULT));
6918         }
6919         return (0);
6920 }
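
/*
 * Illustrative sketch (userland, not compiled here): a caller that sized
 * its array too small is expected to retry with the count reported back,
 * e.g. via the ZONE_LIST_DATALINK subcode of zone(2); zid is an assumed
 * zoneid_t variable and error handling is abbreviated:
 *
 *	int n = 0;
 *	datalink_id_t *ids = NULL;
 *
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zid, &n, ids);
 *	ids = malloc(n * sizeof (datalink_id_t));
 *	if (syscall(SYS_zone, ZONE_LIST_DATALINK, zid, &n, ids) != 0)
 *		err(1, "ZONE_LIST_DATALINK");
 */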
6921 
6922 /*
6923  * Public interface for looking up a zone by zoneid.  It's a customized
6924  * version for netstack_zone_create(), and may only be called from the ZSD
6925  * create callbacks: it doesn't take a reference on the zone structure, so
6926  * if it were called elsewhere the zone could disappear after zonehash_lock
6927  * is dropped.
6928  *
6929  * Furthermore it:
6930  * 1. Doesn't check the status of the zone.
6931  * 2. May be called even before zone_init(); in that case the address of
6932  *    zone0 is returned directly, and netstack_zone_create() will only
6933  *    assign a value to zone0.zone_netstack, which won't break anything.
6934  * 3. Returns without the zone being held.
6935  */
6936 zone_t *
6937 zone_find_by_id_nolock(zoneid_t zoneid)
6938 {
6939         zone_t *zone;
6940 
6941         mutex_enter(&zonehash_lock);
6942         if (zonehashbyid == NULL)
6943                 zone = &zone0;
6944         else
6945                 zone = zone_find_all_by_id(zoneid);
6946         mutex_exit(&zonehash_lock);
6947         return (zone);
6948 }
6949 
6950 /*
6951  * Walk the datalinks for a given zone
6952  */
6953 int
6954 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6955     void *data)
6956 {
6957         zone_t          *zone;
6958         zone_dl_t       *zdl;
6959         datalink_id_t   *idarray;
6960         uint_t          idcount = 0;
6961         int             i, ret = 0;
6962 
6963         if ((zone = zone_find_by_id(zoneid)) == NULL)
6964                 return (ENOENT);
6965 
6966         /*
6967          * We first build an array of linkids so that we can walk these and
6968          * execute the callback with the zone_lock dropped.
6969          */
6970         mutex_enter(&zone->zone_lock);
6971         list_for_each(&zone->zone_dl_list, zdl) {
6972                 idcount++;
6973         }
6974 
6975         if (idcount == 0) {
6976                 mutex_exit(&zone->zone_lock);
6977                 zone_rele(zone);
6978                 return (0);
6979         }
6980 
6981         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6982         if (idarray == NULL) {
6983                 mutex_exit(&zone->zone_lock);
6984                 zone_rele(zone);
6985                 return (ENOMEM);
6986         }
6987 
6988         i = 0;
6989         list_for_each(&zone->zone_dl_list, zdl) {
6990                 idarray[i] = zdl->zdl_id;
6991                 i++;
6992         }
6993 
6994         mutex_exit(&zone->zone_lock);
6995 
6996         for (i = 0; i < idcount && ret == 0; i++) {
6997                 if ((ret = (*cb)(idarray[i], data)) != 0)
6998                         break;
6999         }
7000 
7001         zone_rele(zone);
7002         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7003         return (ret);
7004 }
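
/*
 * Illustrative sketch (an assumed in-kernel consumer): count the
 * datalinks assigned to a zone without holding zone_lock across the
 * callback:
 *
 *	static int
 *	count_link_cb(datalink_id_t linkid, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nlinks = 0;
 *	(void) zone_datalink_walk(zoneid, count_link_cb, &nlinks);
 */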
7005 
7006 static char *
7007 zone_net_type2name(int type)
7008 {
7009         switch (type) {
7010         case ZONE_NETWORK_ADDRESS:
7011                 return (ZONE_NET_ADDRNAME);
7012         case ZONE_NETWORK_DEFROUTER:
7013                 return (ZONE_NET_RTRNAME);
7014         default:
7015                 return (NULL);
7016         }
7017 }
7018 
7019 static int
7020 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7021 {
7022         zone_t *zone;
7023         zone_dl_t *zdl;
7024         nvlist_t *nvl;
7025         int err = 0;
7026         uint8_t *new = NULL;
7027         char *nvname;
7028         int bufsize;
7029         datalink_id_t linkid = znbuf->zn_linkid;
7030 
7031         if (secpolicy_zone_config(CRED()) != 0)
7032                 return (set_errno(EPERM));
7033 
7034         if (zoneid == GLOBAL_ZONEID)
7035                 return (set_errno(EINVAL));
7036 
7037         nvname = zone_net_type2name(znbuf->zn_type);
7038         bufsize = znbuf->zn_len;
7039         new = znbuf->zn_val;
7040         if (nvname == NULL)
7041                 return (set_errno(EINVAL));
7042 
7043         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7044                 return (set_errno(EINVAL));
7045         }
7046 
7047         mutex_enter(&zone->zone_lock);
7048         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7049                 err = ENXIO;
7050                 goto done;
7051         }
7052         if ((nvl = zdl->zdl_net) == NULL) {
7053                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7054                         err = ENOMEM;
7055                         goto done;
7056                 } else {
7057                         zdl->zdl_net = nvl;
7058                 }
7059         }
7060         if (nvlist_exists(nvl, nvname)) {
7061                 err = EINVAL;
7062                 goto done;
7063         }
7064         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7065         ASSERT(err == 0);
7066 done:
7067         mutex_exit(&zone->zone_lock);
7068         zone_rele(zone);
7069         if (err != 0)
7070                 return (set_errno(err));
7071         else
7072                 return (0);
7073 }
7074 
7075 static int
7076 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7077 {
7078         zone_t *zone;
7079         zone_dl_t *zdl;
7080         nvlist_t *nvl;
7081         uint8_t *ptr;
7082         uint_t psize;
7083         int err = 0;
7084         char *nvname;
7085         int bufsize;
7086         void *buf;
7087         datalink_id_t linkid = znbuf->zn_linkid;
7088 
7089         if (zoneid == GLOBAL_ZONEID)
7090                 return (set_errno(EINVAL));
7091 
7092         nvname = zone_net_type2name(znbuf->zn_type);
7093         bufsize = znbuf->zn_len;
7094         buf = znbuf->zn_val;
7095 
7096         if (nvname == NULL)
7097                 return (set_errno(EINVAL));
7098         if ((zone = zone_find_by_id(zoneid)) == NULL)
7099                 return (set_errno(EINVAL));
7100 
7101         mutex_enter(&zone->zone_lock);
7102         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7103                 err = ENXIO;
7104                 goto done;
7105         }
7106         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7107                 err = ENOENT;
7108                 goto done;
7109         }
7110         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7111         ASSERT(err == 0);
7112 
7113         if (psize > bufsize) {
7114                 err = ENOBUFS;
7115                 goto done;
7116         }
7117         znbuf->zn_len = psize;
7118         bcopy(ptr, buf, psize);
7119 done:
7120         mutex_exit(&zone->zone_lock);
7121         zone_rele(zone);
7122         if (err != 0)
7123                 return (set_errno(err));
7124         else
7125                 return (0);
7126 }
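
/*
 * Illustrative sketch: the per-datalink zdl_net nvlist built by
 * zone_set_network() maps each ZONE_NETWORK_* attribute name to a uint8
 * array, so after an address and a default router have been configured
 * for a link it conceptually looks like:
 *
 *	zdl->zdl_net = {
 *		ZONE_NET_ADDRNAME -> uint8[] (address bytes, zn_len long)
 *		ZONE_NET_RTRNAME  -> uint8[] (router bytes, zn_len long)
 *	}
 *
 * zone_get_network() looks the attribute back up and copies at most
 * zn_len bytes out into zn_val, updating zn_len with the actual size.
 */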