1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
  29  * triggered from a "stub" rnode via a special set of vnodeops.
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/time.h>
  37 #include <sys/vnode.h>
  38 #include <sys/vfs.h>
  39 #include <sys/vfs_opreg.h>
  40 #include <sys/file.h>
  41 #include <sys/filio.h>
  42 #include <sys/uio.h>
  43 #include <sys/buf.h>
  44 #include <sys/mman.h>
  45 #include <sys/pathname.h>
  46 #include <sys/dirent.h>
  47 #include <sys/debug.h>
  48 #include <sys/vmsystm.h>
  49 #include <sys/fcntl.h>
  50 #include <sys/flock.h>
  51 #include <sys/swap.h>
  52 #include <sys/errno.h>
  53 #include <sys/strsubr.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/kmem.h>
  56 #include <sys/mount.h>
  57 #include <sys/cmn_err.h>
  58 #include <sys/pathconf.h>
  59 #include <sys/utsname.h>
  60 #include <sys/dnlc.h>
  61 #include <sys/acl.h>
  62 #include <sys/systeminfo.h>
  63 #include <sys/policy.h>
  64 #include <sys/sdt.h>
  65 #include <sys/list.h>
  66 #include <sys/stat.h>
  67 #include <sys/mntent.h>
  68 #include <sys/priv.h>
  69 
  70 #include <rpc/types.h>
  71 #include <rpc/auth.h>
  72 #include <rpc/clnt.h>
  73 
  74 #include <nfs/nfs.h>
  75 #include <nfs/nfs_clnt.h>
  76 #include <nfs/nfs_acl.h>
  77 #include <nfs/lm.h>
  78 #include <nfs/nfs4.h>
  79 #include <nfs/nfs4_kprot.h>
  80 #include <nfs/rnode4.h>
  81 #include <nfs/nfs4_clnt.h>
  82 #include <nfs/nfsid_map.h>
  83 #include <nfs/nfs4_idmap_impl.h>
  84 
  85 #include <vm/hat.h>
  86 #include <vm/as.h>
  87 #include <vm/page.h>
  88 #include <vm/pvn.h>
  89 #include <vm/seg.h>
  90 #include <vm/seg_map.h>
  91 #include <vm/seg_kpm.h>
  92 #include <vm/seg_vn.h>
  93 
  94 #include <fs/fs_subr.h>
  95 
  96 #include <sys/ddi.h>
  97 #include <sys/int_fmtio.h>
  98 
  99 #include <sys/sunddi.h>
 100 
 101 #include <sys/priv_names.h>
 102 
 103 extern zone_key_t       nfs4clnt_zone_key;
 104 extern zone_key_t       nfsidmap_zone_key;
 105 
 106 /*
 107  * Timer value used by the automatic unmounter thread.
 108  */
 109 static int nfs4_trigger_thread_timer = 20;      /* in seconds */
 110 
 111 /*
 112  * Default ephemeral mount timeout.
      *
      * NOTE(review): presumably in seconds and used to seed ntg_mount_to;
      * the consumer is not in this part of the file -- confirm.
 113  */
 114 static uint_t nfs4_trigger_mount_to = 240;
 115 
 116 typedef struct nfs4_trigger_globals {
             /*
              * Trigger-mount state; nfs4_trigger_mount() obtains it with
              * zone_getspecific(nfs4_ephemeral_key, zone).
              */
             /* held while a new tree is linked onto ntg_forest */
 117         kmutex_t                ntg_forest_lock;
             /* NOTE(review): presumably the mount timeout; not used here */
 118         uint_t                  ntg_mount_to;
             /* NOTE(review): presumably "harvester started" flag -- confirm */
 119         int                     ntg_thread_started;
             /* head of the list of ephemeral trees, chained via net_next */
 120         nfs4_ephemeral_tree_t   *ntg_forest;
 121 } nfs4_trigger_globals_t;
 122 
 123 kmutex_t        nfs4_ephemeral_thread_lock;
     /*
      * NOTE(review): the lock above is not referenced in this part of the
      * file; presumably it serializes harvester thread start-up -- confirm.
      */
 124 
     /*
      * Zone key passed to zone_getspecific() to fetch the
      * nfs4_trigger_globals_t used by nfs4_trigger_mount().
      */
 125 zone_key_t      nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
 126 
 127 static void     nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
 128 
 129 /*
 130  * Used for ephemeral mounts; contains data either duplicated from
 131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 132  *
 133  * It's intended that this structure is used solely for ephemeral
 134  * mount-type specific data, for passing this data to
 135  * nfs4_trigger_nargs_create().
      *
      * NOTE(review): the code that fills in and frees these fields is
      * outside this part of the file, so the per-field notes below are
      * inferred from the names only -- confirm before relying on them.
 136  */
 137 typedef struct ephemeral_servinfo {
 138         char                    *esi_hostname;
 139         char                    *esi_netname;
 140         char                    *esi_path;
             /* presumably the length associated with esi_path -- TODO confirm */
 141         int                     esi_path_len;
 142         int                     esi_mount_flags;
 143         struct netbuf           *esi_addr;
 144         struct netbuf           *esi_syncaddr;
 145         struct knetconfig       *esi_knconf;
 146 } ephemeral_servinfo_t;
 147 
 148 /*
 149  * Collect together the mount-type specific and generic data args.
      *
      * Built by nfs4_trigger_domount_args_create(), handed to
      * nfs4_trigger_domount() and released with
      * nfs4_trigger_domount_args_destroy() (see nfs4_trigger_mount()).
 150  */
 151 typedef struct domount_args {
             /* mount-type specific data (see ephemeral_servinfo_t) */
 152         ephemeral_servinfo_t    *dma_esi;
 153         char                    *dma_hostlist; /* comma-sep. for RO failover */
             /* NOTE(review): presumably the generic mount arguments -- confirm */
 154         struct nfs_args         *dma_nargs;
 155 } domount_args_t;
 156 
 157 
 158 /*
 159  * The vnode ops functions for a trigger stub vnode
 160  */
 161 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
 162 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
 163     caller_context_t *);
 164 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
 165     caller_context_t *);
 166 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
 167     caller_context_t *);
 168 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
 169     caller_context_t *);
 170 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
 171     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
 172     int *, pathname_t *);
 173 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
 174     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
 175     vsecattr_t *);
 176 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 177     int);
 178 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
 179     caller_context_t *, int);
 180 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
 181     cred_t *, caller_context_t *, int);
 182 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
 183     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
 184 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 185     caller_context_t *, int);
 186 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
 187     cred_t *, caller_context_t *, int);
 188 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
 189 
 190 /*
 191  * Regular NFSv4 vnodeops that we need to reference directly
 192  */
 193 extern int      nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
 194                     caller_context_t *);
 195 extern void     nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
 196 extern int      nfs4_rwlock(vnode_t *, int, caller_context_t *);
 197 extern void     nfs4_rwunlock(vnode_t *, int, caller_context_t *);
 198 extern int      nfs4_lookup(vnode_t *, char *, vnode_t **,
 199                     struct pathname *, int, vnode_t *, cred_t *,
 200                     caller_context_t *, int *, pathname_t *);
 201 extern int      nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 202                     caller_context_t *);
 203 extern int      nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 204                     caller_context_t *);
 205 extern int      nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
 206 extern int      nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
 207 
 208 static int      nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
 209 static int      nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
 210     cred_t *, vnode_t **);
 211 static int      nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
 212     domount_args_t **dmap);
 213 static void     nfs4_trigger_domount_args_destroy(domount_args_t *dma,
 214     vnode_t *vp);
 215 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
 216     cred_t *);
 217 static void     nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
 218 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
 219     servinfo4_t *);
 220 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
 221     cred_t *);
 222 static struct nfs_args  *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
 223     ephemeral_servinfo_t *);
 224 static void     nfs4_trigger_nargs_destroy(struct nfs_args *);
 225 static char     *nfs4_trigger_create_mntopts(vfs_t *);
 226 static void     nfs4_trigger_destroy_mntopts(char *);
 227 static int      nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
 228 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
 229 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
 230     struct netbuf *, int);
 231 
 232 extern int      umount2_engine(vfs_t *, int, cred_t *, int);
 233 
 234 vnodeops_t *nfs4_trigger_vnodeops;      /* NOTE(review): presumably built from nfs4_trigger_vnodeops_template elsewhere -- confirm */
 235 
 236 /*
 237  * These are the vnodeops that we must define for stub vnodes.
 238  *
 239  *
 240  * Many of the VOPs defined for NFSv4 do not need to be defined here,
 241  * for various reasons. This will result in the VFS default function being
 242  * used:
 243  *
 244  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
 245  *   lost the reference to the stub vnode, meaning these should not be called:
 246  *       close, read, write, ioctl, readdir, seek.
 247  *
 248  * - These VOPs are meaningless for vnodes without data pages. Since the
 249  *   stub vnode is of type VDIR, these should not be called:
 250  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
 251  *
 252  * - These VOPs are otherwise not applicable, and should not be called:
 253  *       dump, setsecattr.
 254  *
 255  *
 256  * These VOPs we do not want to define, but nor do we want the VFS default
 257  * action. Instead, we specify the VFS error function, with fs_error(), but
 258  * note that fs_error() is not actually called. Instead it results in the
 259  * use of the error function defined for the particular VOP, in vn_ops_table[]:
 260  *
 261  * -   frlock, dispose, shrlock.
 262  *
 263  *
 264  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
 265  * NOTE: if any of these ops involve an OTW call with the stub FH, then
 266  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
 267  * to protect the security data in the servinfo4_t for the "parent"
 268  * filesystem that contains the stub.
 269  *
 270  * - These VOPs should not trigger a mount, so that "ls -l" does not:
 271  *       pathconf, getsecattr.
 272  *
 273  * - These VOPs would not make sense to trigger:
 274  *       inactive, rwlock, rwunlock, fid, realvp.
 275  */
 276 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
             /* ops routed to the nfs4_trigger_*() wrappers in this file */
 277         VOPNAME_OPEN,           { .vop_open = nfs4_trigger_open },
 278         VOPNAME_GETATTR,        { .vop_getattr = nfs4_trigger_getattr },
 279         VOPNAME_SETATTR,        { .vop_setattr = nfs4_trigger_setattr },
 280         VOPNAME_ACCESS,         { .vop_access = nfs4_trigger_access },
 281         VOPNAME_LOOKUP,         { .vop_lookup = nfs4_trigger_lookup },
 282         VOPNAME_CREATE,         { .vop_create = nfs4_trigger_create },
 283         VOPNAME_REMOVE,         { .vop_remove = nfs4_trigger_remove },
 284         VOPNAME_LINK,           { .vop_link = nfs4_trigger_link },
 285         VOPNAME_RENAME,         { .vop_rename = nfs4_trigger_rename },
 286         VOPNAME_MKDIR,          { .vop_mkdir = nfs4_trigger_mkdir },
 287         VOPNAME_RMDIR,          { .vop_rmdir = nfs4_trigger_rmdir },
 288         VOPNAME_SYMLINK,        { .vop_symlink = nfs4_trigger_symlink },
 289         VOPNAME_READLINK,       { .vop_readlink = nfs4_trigger_readlink },
             /* ops handed straight to the regular NFSv4 vnodeops */
 290         VOPNAME_INACTIVE,       { .vop_inactive = nfs4_inactive },
 291         VOPNAME_FID,            { .vop_fid = nfs4_fid },
 292         VOPNAME_RWLOCK,         { .vop_rwlock = nfs4_rwlock },
 293         VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs4_rwunlock },
 294         VOPNAME_REALVP,         { .vop_realvp = nfs4_realvp },
 295         VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs4_getsecattr },
 296         VOPNAME_PATHCONF,       { .vop_pathconf = nfs4_pathconf },
             /* ops that must fail: fs_error selects the per-VOP error function */
 297         VOPNAME_FRLOCK,         { .error = fs_error },
 298         VOPNAME_DISPOSE,        { .error = fs_error },
 299         VOPNAME_SHRLOCK,        { .error = fs_error },
 300         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
             /* end-of-table marker */
 301         NULL, NULL
 302 };
 303 
 304 static void
 305 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
 306 {
 307         ASSERT(mutex_owned(&net->net_cnt_lock));
 308         net->net_refcnt++;
 309         ASSERT(net->net_refcnt != 0);
 310 }
 311 
 312 static void
 313 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
 314 {
 315         mutex_enter(&net->net_cnt_lock);
 316         nfs4_ephemeral_tree_incr(net);
 317         mutex_exit(&net->net_cnt_lock);
 318 }
 319 
 320 /*
 321  * We need a safe way to decrement the refcnt whilst the
 322  * lock is being held.
 323  */
 324 static void
 325 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
 326 {
 327         ASSERT(mutex_owned(&net->net_cnt_lock));
 328         ASSERT(net->net_refcnt != 0);
 329         net->net_refcnt--;
 330 }
 331 
 332 static void
 333 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
 334 {
 335         mutex_enter(&net->net_cnt_lock);
 336         nfs4_ephemeral_tree_decr(net);
 337         mutex_exit(&net->net_cnt_lock);
 338 }
 339 
 340 /*
 341  * Trigger ops for stub vnodes; for mirror mounts, etc.
 342  *
 343  * The general idea is that a "triggering" op will first call
 344  * nfs4_trigger_mount(), which will find out whether a mount has already
 345  * been triggered.
 346  *
 347  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 348  * of the covering vfs.
 349  *
 350  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 351  * and again set newvp, as above.
 352  *
 353  * The triggering op may then re-issue the VOP by calling it on newvp.
 354  *
 355  * Note that some ops may perform custom action, and may or may not need
 356  * to trigger a mount.
 357  *
 358  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 359  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 360  * and that would just recurse. Instead, we call the v4 op directly,
 361  * by name.  This is OK, since we know that the vnode is for NFSv4,
 362  * otherwise it couldn't be a stub.
 363  *
 364  */
 365 
 366 static int
 367 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 368 {
 369         int error;
 370         vnode_t *newvp;
 371 
 372         error = nfs4_trigger_mount(*vpp, cr, &newvp);
 373         if (error)
 374                 return (error);
 375 
 376         /* Release the stub vnode, as we're losing the reference to it */
 377         VN_RELE(*vpp);
 378 
 379         /* Give the caller the root vnode of the newly-mounted fs */
 380         *vpp = newvp;
 381 
 382         /* return with VN_HELD(newvp) */
 383         return (VOP_OPEN(vpp, flag, cr, ct));
 384 }
 385 
 386 void
 387 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
 388 {
 389         uint_t mask;
 390         timespec_t now;
 391 
 392         /*
 393          * Set some attributes here for referrals.
 394          */
 395         mask = vap->va_mask;
 396         bzero(vap, sizeof (struct vattr));
 397         vap->va_mask = mask;
 398         vap->va_uid  = 0;
 399         vap->va_gid  = 0;
 400         vap->va_nlink        = 1;
 401         vap->va_size = 1;
 402         gethrestime(&now);
 403         vap->va_atime        = now;
 404         vap->va_mtime        = now;
 405         vap->va_ctime        = now;
 406         vap->va_type = VDIR;
 407         vap->va_mode = 0555;
 408         vap->va_fsid = vp->v_vfsp->vfs_dev;
 409         vap->va_rdev = 0;
 410         vap->va_blksize      = MAXBSIZE;
 411         vap->va_nblocks      = 1;
 412         vap->va_seq  = 0;
 413 }
 414 
 415 /*
 416  * For the majority of cases, nfs4_trigger_getattr() will not trigger
 417  * a mount. However, if ATTR_TRIGGER is set, we are being informed
 418  * that we need to force the mount before we attempt to determine
 419  * the attributes. The intent is an atomic operation for security
 420  * testing.
 421  *
 422  * If we're not triggering a mount, we can still inquire about the
 423  * actual attributes from the server in the mirror mount case,
 424  * and will return manufactured attributes for a referral (see
 425  * the 'create' branch of find_referral_stubvp()).
 426  */
 427 static int
 428 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 429     caller_context_t *ct)
 430 {
 431         int error;
 432 
 433         if (flags & ATTR_TRIGGER) {
 434                 vnode_t *newvp;
 435 
 436                 error = nfs4_trigger_mount(vp, cr, &newvp);
 437                 if (error)
 438                         return (error);
 439 
 440                 error = VOP_GETATTR(newvp, vap, flags, cr, ct);
 441                 VN_RELE(newvp);
 442 
 443         } else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
 444 
 445                 error = nfs4_getattr(vp, vap, flags, cr, ct);
 446 
 447         } else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
 448 
 449                 nfs4_fake_attrs(vp, vap);
 450                 error = 0;
 451         }
 452 
 453         return (error);
 454 }
 455 
 456 static int
 457 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 458                 caller_context_t *ct)
 459 {
 460         int error;
 461         vnode_t *newvp;
 462 
 463         error = nfs4_trigger_mount(vp, cr, &newvp);
 464         if (error)
 465                 return (error);
 466 
 467         error = VOP_SETATTR(newvp, vap, flags, cr, ct);
 468         VN_RELE(newvp);
 469 
 470         return (error);
 471 }
 472 
 473 static int
 474 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
 475     caller_context_t *ct)
 476 {
 477         int error;
 478         vnode_t *newvp;
 479 
 480         error = nfs4_trigger_mount(vp, cr, &newvp);
 481         if (error)
 482                 return (error);
 483 
 484         error = VOP_ACCESS(newvp, mode, flags, cr, ct);
 485         VN_RELE(newvp);
 486 
 487         return (error);
 488 }
 489 
 490 static int
 491 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
 492     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
 493     caller_context_t *ct, int *deflags, pathname_t *rpnp)
 494 {
 495         int error;
 496         vnode_t *newdvp;
 497         rnode4_t *drp = VTOR4(dvp);
 498 
 499         ASSERT(RP_ISSTUB(drp));
 500 
 501         /*
 502          * It's not legal to lookup ".." for an fs root, so we mustn't pass
 503          * that up. Instead, pass onto the regular op, regardless of whether
 504          * we've triggered a mount.
 505          */
 506         if (strcmp(nm, "..") == 0)
 507                 if (RP_ISSTUB_MIRRORMOUNT(drp)) {
 508                         return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
 509                             ct, deflags, rpnp));
 510                 } else if (RP_ISSTUB_REFERRAL(drp)) {
 511                         /* Return the parent vnode */
 512                         return (vtodv(dvp, vpp, cr, TRUE));
 513                 }
 514 
 515         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 516         if (error)
 517                 return (error);
 518 
 519         error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
 520             deflags, rpnp);
 521         VN_RELE(newdvp);
 522 
 523         return (error);
 524 }
 525 
 526 static int
 527 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
 528     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
 529     int flags, caller_context_t *ct, vsecattr_t *vsecp)
 530 {
 531         int error;
 532         vnode_t *newdvp;
 533 
 534         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 535         if (error)
 536                 return (error);
 537 
 538         error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
 539             flags, ct, vsecp);
 540         VN_RELE(newdvp);
 541 
 542         return (error);
 543 }
 544 
 545 static int
 546 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
 547     int flags)
 548 {
 549         int error;
 550         vnode_t *newdvp;
 551 
 552         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 553         if (error)
 554                 return (error);
 555 
 556         error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
 557         VN_RELE(newdvp);
 558 
 559         return (error);
 560 }
 561 
 562 static int
 563 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
 564     caller_context_t *ct, int flags)
 565 {
 566         int error;
 567         vnode_t *newtdvp;
 568 
 569         error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
 570         if (error)
 571                 return (error);
 572 
 573         /*
 574          * We don't check whether svp is a stub. Let the NFSv4 code
 575          * detect that error, and return accordingly.
 576          */
 577         error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
 578         VN_RELE(newtdvp);
 579 
 580         return (error);
 581 }
 582 
 583 static int
 584 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
 585     cred_t *cr, caller_context_t *ct, int flags)
 586 {
 587         int error;
 588         vnode_t *newsdvp;
 589         rnode4_t *tdrp = VTOR4(tdvp);
 590 
 591         /*
 592          * We know that sdvp is a stub, otherwise we would not be here.
 593          *
 594          * If tdvp is also be a stub, there are two possibilities: it
 595          * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
 596          * or it is a different stub [!VN_CMP(sdvp, tdvp)].
 597          *
 598          * In the former case, just trigger sdvp, and treat tdvp as
 599          * though it were not a stub.
 600          *
 601          * In the latter case, it might be a different stub for the
 602          * same server fs as sdvp, or for a different server fs.
 603          * Regardless, from the client perspective this would still
 604          * be a cross-filesystem rename, and should not be allowed,
 605          * so return EXDEV, without triggering either mount.
 606          */
 607         if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
 608                 return (EXDEV);
 609 
 610         error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
 611         if (error)
 612                 return (error);
 613 
 614         error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
 615 
 616         VN_RELE(newsdvp);
 617 
 618         return (error);
 619 }
 620 
 621 /* ARGSUSED */
 622 static int
 623 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
 624     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
 625 {
 626         int error;
 627         vnode_t *newdvp;
 628 
 629         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 630         if (error)
 631                 return (error);
 632 
 633         error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
 634         VN_RELE(newdvp);
 635 
 636         return (error);
 637 }
 638 
 639 static int
 640 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
 641     caller_context_t *ct, int flags)
 642 {
 643         int error;
 644         vnode_t *newdvp;
 645 
 646         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 647         if (error)
 648                 return (error);
 649 
 650         error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
 651         VN_RELE(newdvp);
 652 
 653         return (error);
 654 }
 655 
 656 static int
 657 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
 658     cred_t *cr, caller_context_t *ct, int flags)
 659 {
 660         int error;
 661         vnode_t *newdvp;
 662 
 663         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 664         if (error)
 665                 return (error);
 666 
 667         error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
 668         VN_RELE(newdvp);
 669 
 670         return (error);
 671 }
 672 
 673 static int
 674 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
 675     caller_context_t *ct)
 676 {
 677         int error;
 678         vnode_t *newvp;
 679 
 680         error = nfs4_trigger_mount(vp, cr, &newvp);
 681         if (error)
 682                 return (error);
 683 
 684         error = VOP_READLINK(newvp, uiop, cr, ct);
 685         VN_RELE(newvp);
 686 
 687         return (error);
 688 }
 689 
 690 /* end of trigger vnode ops */
 691 
 692 /*
 693  * See if the mount has already been done by another caller.
 694  */
 695 static int
 696 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
 697     bool_t *was_mounted, vfs_t **vfsp)
 698 {
 699         int             error;
 700         mntinfo4_t      *mi = VTOMI4(vp);
 701 
 702         *was_mounted = FALSE;
 703 
 704         error = vn_vfsrlock_wait(vp);
 705         if (error)
 706                 return (error);
 707 
 708         *vfsp = vn_mountedvfs(vp);
 709         if (*vfsp != NULL) {
 710                 /* the mount has already occurred */
 711                 error = VFS_ROOT(*vfsp, newvpp);
 712                 if (!error) {
 713                         /* need to update the reference time  */
 714                         mutex_enter(&mi->mi_lock);
 715                         if (mi->mi_ephemeral)
 716                                 mi->mi_ephemeral->ne_ref_time =
 717                                     gethrestime_sec();
 718                         mutex_exit(&mi->mi_lock);
 719 
 720                         *was_mounted = TRUE;
 721                 }
 722         }
 723 
 724         vn_vfsunlock(vp);
 725         return (0);
 726 }
 727 
 728 /*
 729  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 730  *
 731  * The mount may have already occurred, via another thread. If not,
 732  * assemble the location information - which may require fetching - and
 733  * perform the mount.
 734  *
 735  * Sets newvp to be the root of the fs that is now covering vp. Note
 736  * that we return with VN_HELD(*newvp).
 737  *
 738  * The caller is responsible for passing the VOP onto the covering fs.
 739  */
 740 static int
 741 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
 742 {
 743         int                      error;
 744         vfs_t                   *vfsp;
 745         rnode4_t                *rp = VTOR4(vp);
 746         mntinfo4_t              *mi = VTOMI4(vp);
 747         domount_args_t          *dma;
 748 
 749         nfs4_ephemeral_tree_t   *net;
 750 
 751         bool_t                  must_unlock = FALSE;
 752         bool_t                  is_building = FALSE;
 753         bool_t                  was_mounted = FALSE;
 754 
 755         cred_t                  *mcred = NULL;
 756 
 757         nfs4_trigger_globals_t  *ntg;
 758 
 759         zone_t                  *zone = curproc->p_zone;
 760 
 761         ASSERT(RP_ISSTUB(rp));
 762 
 763         *newvpp = NULL;
 764 
 765         /*
 766          * Has the mount already occurred?
 767          */
 768         error = nfs4_trigger_mounted_already(vp, newvpp,
 769             &was_mounted, &vfsp);
 770         if (error || was_mounted)
 771                 goto done;
 772 
 773         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
 774         ASSERT(ntg != NULL);
 775 
 776         mutex_enter(&mi->mi_lock);
 777 
 778         /*
 779          * We need to lock down the ephemeral tree.
 780          */
 781         if (mi->mi_ephemeral_tree == NULL) {
 782                 net = kmem_zalloc(sizeof (*net), KM_SLEEP);
 783                 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 784                 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
 785                 net->net_refcnt = 1;
 786                 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
 787                 is_building = TRUE;
 788 
 789                 /*
 790                  * We need to add it to the zone specific list for
 791                  * automatic unmounting and harvesting of deadwood.
 792                  */
 793                 mutex_enter(&ntg->ntg_forest_lock);
 794                 if (ntg->ntg_forest != NULL)
 795                         net->net_next = ntg->ntg_forest;
 796                 ntg->ntg_forest = net;
 797                 mutex_exit(&ntg->ntg_forest_lock);
 798 
 799                 /*
 800                  * No lock order confusion with mi_lock because no
 801                  * other node could have grabbed net_tree_lock.
 802                  */
 803                 mutex_enter(&net->net_tree_lock);
 804                 mi->mi_ephemeral_tree = net;
 805                 net->net_mount = mi;
 806                 mutex_exit(&mi->mi_lock);
 807 
 808                 MI4_HOLD(mi);
 809                 VFS_HOLD(mi->mi_vfsp);
 810         } else {
 811                 net = mi->mi_ephemeral_tree;
 812                 nfs4_ephemeral_tree_hold(net);
 813 
 814                 mutex_exit(&mi->mi_lock);
 815 
 816                 mutex_enter(&net->net_tree_lock);
 817 
 818                 /*
 819                  * We can only procede if the tree is neither locked
 820                  * nor being torn down.
 821                  */
 822                 mutex_enter(&net->net_cnt_lock);
 823                 if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
 824                         nfs4_ephemeral_tree_decr(net);
 825                         mutex_exit(&net->net_cnt_lock);
 826                         mutex_exit(&net->net_tree_lock);
 827 
 828                         return (EIO);
 829                 }
 830                 mutex_exit(&net->net_cnt_lock);
 831         }
 832 
 833         mutex_enter(&net->net_cnt_lock);
 834         net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
 835         mutex_exit(&net->net_cnt_lock);
 836 
 837         must_unlock = TRUE;
 838 
 839         error = nfs4_trigger_domount_args_create(vp, cr, &dma);
 840         if (error)
 841                 goto done;
 842 
 843         /*
 844          * Note that since we define mirror mounts to work
 845          * for any user, we simply extend the privileges of
 846          * the user's credentials to allow the mount to
 847          * proceed.
 848          */
 849         mcred = crdup(cr);
 850         if (mcred == NULL) {
 851                 error = EINVAL;
 852                 nfs4_trigger_domount_args_destroy(dma, vp);
 853                 goto done;
 854         }
 855 
 856         crset_zone_privall(mcred);
 857         if (is_system_labeled())
 858                 (void) setpflags(NET_MAC_AWARE, 1, mcred);
 859 
 860         error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
 861         nfs4_trigger_domount_args_destroy(dma, vp);
 862 
 863         DTRACE_PROBE2(nfs4clnt__func__referral__mount,
 864             vnode_t *, vp, int, error);
 865 
 866         crfree(mcred);
 867 
 868 done:
 869 
 870         if (must_unlock) {
 871                 mutex_enter(&net->net_cnt_lock);
 872                 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
 873 
 874                 /*
 875                  * REFCNT: If we are the root of the tree, then we need
 876                  * to keep a reference because we malloced the tree and
 877                  * this is where we tied it to our mntinfo.
 878                  *
 879                  * If we are not the root of the tree, then our tie to
                 * the mntinfo occurred elsewhere and we need to
 881                  * decrement the reference to the tree.
 882                  */
 883                 if (is_building)
 884                         net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
 885                 else
 886                         nfs4_ephemeral_tree_decr(net);
 887                 mutex_exit(&net->net_cnt_lock);
 888 
 889                 mutex_exit(&net->net_tree_lock);
 890         }
 891 
 892         if (!error && (newvpp == NULL || *newvpp == NULL))
 893                 error = ENOSYS;
 894 
 895         return (error);
 896 }
 897 
/*
 * Collect together both the generic & mount-type specific args.
 *
 * Pings the servers known to vp's mntinfo4 and, for each one that answers,
 * builds an nfs_args; the nfs_args are chained via nfs_ext_u.nfs_extB.next.
 * The first responsive server also supplies the esi kept in the result, and
 * the hostnames of all included servers are joined with ',' in a
 * MAXPATHLEN-byte hostlist.
 *
 * On success returns 0 and stores a newly allocated domount_args_t in *dmap;
 * it is released with nfs4_trigger_domount_args_destroy().  Returns EINVAL
 * if no esi could be created for a responding current server, and EINTR if
 * a ping inside the failover loop returned RPC_INTR.  This function does not
 * return successfully until at least one server has responded (it retries
 * once a second).
 */
static int
nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
{
        int nointr;
        char *hostlist;
        servinfo4_t *svp;
        struct nfs_args *nargs, *nargs_head;
        enum clnt_stat status;
        ephemeral_servinfo_t *esi, *esi_first;
        domount_args_t *dma;
        mntinfo4_t *mi = VTOMI4(vp);

        nointr = !(mi->mi_flags & MI4_INT);
        hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

        svp = mi->mi_curr_serv;
        /* check if the current server is responding */
        status = nfs4_trigger_ping_server(svp, nointr);
        if (status == RPC_SUCCESS) {
                esi_first = nfs4_trigger_esi_create(vp, svp, cr);
                if (esi_first == NULL) {
                        /* nothing else has been allocated yet */
                        kmem_free(hostlist, MAXPATHLEN);
                        return (EINVAL);
                }

                (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

                nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
        } else {
                /* current server did not respond */
                esi_first = NULL;
                nargs_head = NULL;
        }
        /* nargs tracks the tail of the list; initially that is the head */
        nargs = nargs_head;

        /*
         * NFS RO failover.
         *
         * If we have multiple servinfo4 structures, linked via sv_next,
         * we must create one nfs_args for each, linking the nfs_args via
         * nfs_ext_u.nfs_extB.next.
         *
         * We need to build a corresponding esi for each, too, but that is
         * used solely for building nfs_args, and may be immediately
         * discarded, as domount() requires the info from just one esi,
         * but all the nfs_args.
         *
         * Currently, the NFS mount code will hang if not all servers
         * requested are available. To avoid that, we need to ping each
         * server, here, and remove it from the list if it is not
         * responding. This has the side-effect of that server then
         * being permanently unavailable for this failover mount, even if
         * it recovers. That's unfortunate, but the best we can do until
         * the mount code path is fixed.
         */

        /*
         * If the current server was down, loop indefinitely until we find
         * at least one responsive server.
         */
        do {
                /* no locking needed for sv_next; it is only set at fs mount */
                for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
                        struct nfs_args *next;

                        /*
                         * nargs_head: the head of the nfs_args list
                         * nargs: the current tail of the list
                         * next: the newly-created element to be added
                         */

                        /*
                         * We've already tried the current server, above;
                         * if it was responding, we have already included it
                         * and it may now be ignored.
                         *
                         * Otherwise, try it again, since it may now have
                         * recovered.
                         */
                        if (svp == mi->mi_curr_serv && esi_first != NULL)
                                continue;

                        /* skip servers flagged as not in use */
                        (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
                        if (svp->sv_flags & SV4_NOTINUSE) {
                                nfs_rw_exit(&svp->sv_lock);
                                continue;
                        }
                        nfs_rw_exit(&svp->sv_lock);

                        /* check if the server is responding */
                        status = nfs4_trigger_ping_server(svp, nointr);
                        if (status == RPC_INTR) {
                                /*
                                 * Interrupted: undo everything built so
                                 * far (hostlist, the kept esi and the whole
                                 * nfs_args chain) before bailing out.
                                 */
                                kmem_free(hostlist, MAXPATHLEN);
                                nfs4_trigger_esi_destroy(esi_first, vp);
                                nargs = nargs_head;
                                while (nargs != NULL) {
                                        next = nargs->nfs_ext_u.nfs_extB.next;
                                        nfs4_trigger_nargs_destroy(nargs);
                                        nargs = next;
                                }
                                return (EINTR);
                        } else if (status != RPC_SUCCESS) {
                                /* if the server did not respond, ignore it */
                                continue;
                        }

                        esi = nfs4_trigger_esi_create(vp, svp, cr);
                        if (esi == NULL)
                                continue;

                        /*
                         * If the original current server (mi_curr_serv)
                         * was down when we first tried it,
                         * (i.e. esi_first == NULL),
                         * we select this new server (svp) to be the server
                         * that we will actually contact (esi_first).
                         *
                         * Note that it's possible that mi_curr_serv == svp,
                         * if that mi_curr_serv was down but has now recovered.
                         */
                        next = nfs4_trigger_nargs_create(mi, svp, esi);
                        if (esi_first == NULL) {
                                ASSERT(nargs == NULL);
                                ASSERT(nargs_head == NULL);
                                nargs_head = next;
                                esi_first = esi;
                                (void) strlcpy(hostlist,
                                    esi_first->esi_hostname, MAXPATHLEN);
                        } else {
                                ASSERT(nargs_head != NULL);
                                nargs->nfs_ext_u.nfs_extB.next = next;
                                (void) strlcat(hostlist, ",", MAXPATHLEN);
                                (void) strlcat(hostlist, esi->esi_hostname,
                                    MAXPATHLEN);
                                /* esi was only needed for hostname & nargs */
                                nfs4_trigger_esi_destroy(esi, vp);
                        }

                        /* the element just added becomes the new tail */
                        nargs = next;
                }

                /* if we've had no response at all, wait a second */
                if (esi_first == NULL)
                        delay(drv_usectohz(1000000));

        } while (esi_first == NULL);
        ASSERT(nargs_head != NULL);

        /* hand ownership of esi_first, hostlist and the chain to dma */
        dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
        dma->dma_esi = esi_first;
        dma->dma_hostlist = hostlist;
        dma->dma_nargs = nargs_head;
        *dmap = dma;

        return (0);
}
1057 
1058 static void
1059 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1060 {
1061         if (dma != NULL) {
1062                 if (dma->dma_esi != NULL && vp != NULL)
1063                         nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1064 
1065                 if (dma->dma_hostlist != NULL)
1066                         kmem_free(dma->dma_hostlist, MAXPATHLEN);
1067 
1068                 if (dma->dma_nargs != NULL) {
1069                         struct nfs_args *nargs = dma->dma_nargs;
1070 
1071                         do {
1072                                 struct nfs_args *next =
1073                                     nargs->nfs_ext_u.nfs_extB.next;
1074 
1075                                 nfs4_trigger_nargs_destroy(nargs);
1076                                 nargs = next;
1077                         } while (nargs != NULL);
1078                 }
1079 
1080                 kmem_free(dma, sizeof (domount_args_t));
1081         }
1082 }
1083 
1084 /*
1085  * The ephemeral_servinfo_t struct contains basic information we will need to
1086  * perform the mount. Whilst the structure is generic across different
1087  * types of ephemeral mount, the way we gather its contents differs.
1088  */
1089 static ephemeral_servinfo_t *
1090 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1091 {
1092         ephemeral_servinfo_t *esi;
1093         rnode4_t *rp = VTOR4(vp);
1094 
1095         ASSERT(RP_ISSTUB(rp));
1096 
1097         /* Call the ephemeral type-specific routine */
1098         if (RP_ISSTUB_MIRRORMOUNT(rp))
1099                 esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1100         else if (RP_ISSTUB_REFERRAL(rp))
1101                 esi = nfs4_trigger_esi_create_referral(vp, cr);
1102         else
1103                 esi = NULL;
1104         return (esi);
1105 }
1106 
1107 static void
1108 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1109 {
1110         rnode4_t *rp = VTOR4(vp);
1111 
1112         ASSERT(RP_ISSTUB(rp));
1113 
1114         /* Currently, no need for an ephemeral type-specific routine */
1115 
1116         /*
1117          * The contents of ephemeral_servinfo_t goes into nfs_args,
1118          * and will be handled by nfs4_trigger_nargs_destroy().
1119          * We need only free the structure itself.
1120          */
1121         if (esi != NULL)
1122                 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1123 }
1124 
1125 /*
1126  * Some of this may turn out to be common with other ephemeral types,
1127  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1128  * common function called.
1129  */
1130 
1131 /*
1132  * Mirror mounts case - should have all data available
1133  */
1134 static ephemeral_servinfo_t *
1135 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1136 {
1137         char                    *stubpath;
1138         struct knetconfig       *sikncp, *svkncp;
1139         struct netbuf           *bufp;
1140         ephemeral_servinfo_t    *esi;
1141 
1142         esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1143 
1144         /* initially set to be our type of ephemeral mount; may be added to */
1145         esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1146 
1147         /*
1148          * We're copying info from the stub rnode's servinfo4, but
1149          * we must create new copies, not pointers, since this information
1150          * is to be associated with the new mount, which will be
1151          * unmounted (and its structures freed) separately
1152          */
1153 
1154         /*
1155          * Sizes passed to kmem_[z]alloc here must match those freed
1156          * in nfs4_free_args()
1157          */
1158 
1159         /*
1160          * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1161          * is difficult to avoid: as we need to read svp to calculate the
1162          * sizes to be allocated.
1163          */
1164         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1165 
1166         esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1167         (void) strcat(esi->esi_hostname, svp->sv_hostname);
1168 
1169         esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1170         bufp = esi->esi_addr;
1171         bufp->len = svp->sv_addr.len;
1172         bufp->maxlen = svp->sv_addr.maxlen;
1173         bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1174         bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1175 
1176         esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1177         sikncp = esi->esi_knconf;
1178         svkncp = svp->sv_knconf;
1179         sikncp->knc_semantics = svkncp->knc_semantics;
1180         sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1181         (void) strcat((char *)sikncp->knc_protofmly,
1182             (char *)svkncp->knc_protofmly);
1183         sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1184         (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1185         sikncp->knc_rdev = svkncp->knc_rdev;
1186 
1187         /*
1188          * Used when AUTH_DH is negotiated.
1189          *
1190          * This is ephemeral mount-type specific, since it contains the
1191          * server's time-sync syncaddr.
1192          */
1193         if (svp->sv_dhsec) {
1194                 struct netbuf *bufp;
1195                 sec_data_t *sdata;
1196                 dh_k4_clntdata_t *data;
1197 
1198                 sdata = svp->sv_dhsec;
1199                 data = (dh_k4_clntdata_t *)sdata->data;
1200                 ASSERT(sdata->rpcflavor == AUTH_DH);
1201 
1202                 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1203                 bufp->len = data->syncaddr.len;
1204                 bufp->maxlen = data->syncaddr.maxlen;
1205                 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1206                 bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1207                 esi->esi_syncaddr = bufp;
1208 
1209                 if (data->netname != NULL) {
1210                         int nmlen = data->netnamelen;
1211 
1212                         /*
1213                          * We need to copy from a dh_k4_clntdata_t
1214                          * netname/netnamelen pair to a NUL-terminated
1215                          * netname string suitable for putting in nfs_args,
1216                          * where the latter has no netnamelen field.
1217                          */
1218                         esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1219                         bcopy(data->netname, esi->esi_netname, nmlen);
1220                 }
1221         } else {
1222                 esi->esi_syncaddr = NULL;
1223                 esi->esi_netname = NULL;
1224         }
1225 
1226         stubpath = fn_path(VTOSV(vp)->sv_name);
1227         /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1228         ASSERT(*stubpath == '.');
1229         stubpath += 1;
1230 
1231         /* for nfs_args->fh */
1232         esi->esi_path_len = strlen(stubpath) + 1;
1233         if (strcmp(svp->sv_path, "/") != 0)
1234                 esi->esi_path_len += strlen(svp->sv_path);
1235         esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1236         if (strcmp(svp->sv_path, "/") != 0)
1237                 (void) strcat(esi->esi_path, svp->sv_path);
1238         (void) strcat(esi->esi_path, stubpath);
1239 
1240         stubpath -= 1;
1241         /* stubpath allocated by fn_path() */
1242         kmem_free(stubpath, strlen(stubpath) + 1);
1243 
1244         nfs_rw_exit(&svp->sv_lock);
1245 
1246         return (esi);
1247 }
1248 
1249 /*
1250  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1251  * get network information required to do the mount call.
1252  */
1253 int
1254 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1255 {
1256         door_arg_t      door_args;
1257         door_handle_t   dh;
1258         XDR             xdr;
1259         refd_door_args_t *xdr_argsp;
1260         refd_door_res_t  *orig_resp;
1261         k_sigset_t      smask;
1262         int             xdr_len = 0;
1263         int             res_len = 16; /* length of an ip adress */
1264         int             orig_reslen = res_len;
1265         int             error = 0;
1266         struct nfsidmap_globals *nig;
1267 
1268         if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1269                 return (ECONNREFUSED);
1270 
1271         nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1272         ASSERT(nig != NULL);
1273 
1274         mutex_enter(&nig->nfsidmap_daemon_lock);
1275         dh = nig->nfsidmap_daemon_dh;
1276         if (dh == NULL) {
1277                 mutex_exit(&nig->nfsidmap_daemon_lock);
1278                 cmn_err(CE_NOTE,
1279                     "nfs4_callmapid: nfsmapid daemon not " \
1280                     "running unable to resolve host name\n");
1281                 return (EINVAL);
1282         }
1283         door_ki_hold(dh);
1284         mutex_exit(&nig->nfsidmap_daemon_lock);
1285 
1286         xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1287 
1288         xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1289         xdr_argsp->xdr_len = xdr_len;
1290         xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1291 
1292         xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1293             xdr_len, XDR_ENCODE);
1294 
1295         if (!xdr_utf8string(&xdr, server)) {
1296                 kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1297                 door_ki_rele(dh);
1298                 return (1);
1299         }
1300 
1301         if (orig_reslen)
1302                 orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1303 
1304         door_args.data_ptr = (char *)xdr_argsp;
1305         door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1306         door_args.desc_ptr = NULL;
1307         door_args.desc_num = 0;
1308         door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1309         door_args.rsize = res_len;
1310 
1311         sigintr(&smask, 1);
1312         error = door_ki_upcall(dh, &door_args);
1313         sigunintr(&smask);
1314 
1315         door_ki_rele(dh);
1316 
1317         kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1318         if (error) {
1319                 kmem_free(orig_resp, orig_reslen);
1320                 /*
1321                  * There is no door to connect to. The referral daemon
1322                  * must not be running yet.
1323                  */
1324                 cmn_err(CE_WARN,
1325                     "nfsmapid not running cannot resolve host name");
1326                 goto out;
1327         }
1328 
1329         /*
1330          * If the results buffer passed back are not the same as
1331          * what was sent free the old buffer and use the new one.
1332          */
1333         if (orig_resp && orig_reslen) {
1334                 refd_door_res_t *door_resp;
1335 
1336                 door_resp = (refd_door_res_t *)door_args.rbuf;
1337                 if ((void *)door_args.rbuf != orig_resp)
1338                         kmem_free(orig_resp, orig_reslen);
1339                 if (door_resp->res_status == 0) {
1340                         xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1341                             door_resp->xdr_len, XDR_DECODE);
1342                         bzero(resp, sizeof (struct nfs_fsl_info));
1343                         if (!xdr_nfs_fsl_info(&xdr, resp)) {
1344                                 DTRACE_PROBE2(
1345                                     nfs4clnt__debug__referral__upcall__xdrfail,
1346                                     struct nfs_fsl_info *, resp,
1347                                     char *, "nfs4_callmapid");
1348                                 error = EINVAL;
1349                         }
1350                 } else {
1351                         DTRACE_PROBE2(
1352                             nfs4clnt__debug__referral__upcall__badstatus,
1353                             int, door_resp->res_status,
1354                             char *, "nfs4_callmapid");
1355                         error = door_resp->res_status;
1356                 }
1357                 kmem_free(door_args.rbuf, door_args.rsize);
1358         }
1359 out:
1360         DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1361             char *, server, int, error);
1362         return (error);
1363 }
1364 
1365 /*
1366  * Fetches the fs_locations attribute. Typically called
1367  * from a Replication/Migration/Referrals/Mirror-mount context
1368  *
1369  * Fills in the attributes in garp. The caller is assumed
1370  * to have allocated memory for garp.
1371  *
1372  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1373  *       it's already done by caller. Otherwise lock these mutexes
1374  *       before doing the rfs4call().
1375  *
1376  * Returns
1377  *      1        for success
1378  *      0        for failure
1379  */
1380 int
1381 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1382     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1383 {
1384         COMPOUND4args_clnt args;
1385         COMPOUND4res_clnt res;
1386         nfs_argop4 *argop;
1387         int argoplist_size = 3 * sizeof (nfs_argop4);
1388         nfs4_server_t *sp = NULL;
1389         int doqueue = 1;
1390         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1391         int retval = 1;
1392         struct nfs4_clnt *nfscl;
1393 
1394         if (lock == TRUE)
1395                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1396         else
1397                 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1398                     nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1399 
1400         sp = find_nfs4_server(mi);
1401         if (lock == TRUE)
1402                 nfs_rw_exit(&mi->mi_recovlock);
1403 
1404         if (sp != NULL)
1405                 mutex_exit(&sp->s_lock);
1406 
1407         if (lock == TRUE) {
1408                 if (sp != NULL)
1409                         (void) nfs_rw_enter_sig(&sp->s_recovlock,
1410                             RW_WRITER, 0);
1411                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1412         } else {
1413                 if (sp != NULL) {
1414                         ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1415                             nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1416                 }
1417         }
1418 
1419         /*
1420          * Do we want to do the setup for recovery here?
1421          *
1422          * We know that the server responded to a null ping a very
1423          * short time ago, and we know that we intend to do a
1424          * single stateless operation - we want to fetch attributes,
1425          * so we know we can't encounter errors about state.  If
1426          * something goes wrong with the GETATTR, like not being
1427          * able to get a response from the server or getting any
1428          * kind of FH error, we should fail the mount.
1429          *
1430          * We may want to re-visited this at a later time.
1431          */
1432         argop = kmem_alloc(argoplist_size, KM_SLEEP);
1433 
1434         args.ctag = TAG_GETATTR_FSLOCATION;
1435         /* PUTFH LOOKUP GETATTR */
1436         args.array_len = 3;
1437         args.array = argop;
1438 
1439         /* 0. putfh file */
1440         argop[0].argop = OP_CPUTFH;
1441         argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1442 
1443         /* 1. lookup name, can't be dotdot */
1444         argop[1].argop = OP_CLOOKUP;
1445         argop[1].nfs_argop4_u.opclookup.cname = nm;
1446 
1447         /* 2. file attrs */
1448         argop[2].argop = OP_GETATTR;
1449         argop[2].nfs_argop4_u.opgetattr.attr_request =
1450             FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1451             FATTR4_MOUNTED_ON_FILEID_MASK;
1452         argop[2].nfs_argop4_u.opgetattr.mi = mi;
1453 
1454         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1455 
1456         if (lock == TRUE) {
1457                 nfs_rw_exit(&mi->mi_recovlock);
1458                 if (sp != NULL)
1459                         nfs_rw_exit(&sp->s_recovlock);
1460         }
1461 
1462         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1463         nfscl->nfscl_stat.referrals.value.ui64++;
1464         DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1465             nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1466 
1467         if (e.error != 0) {
1468                 if (sp != NULL)
1469                         nfs4_server_rele(sp);
1470                 kmem_free(argop, argoplist_size);
1471                 return (0);
1472         }
1473 
1474         /*
1475          * Check for all possible error conditions.
1476          * For valid replies without an ops array or for illegal
1477          * replies, return a failure.
1478          */
1479         if (res.status != NFS4_OK || res.array_len < 3 ||
1480             res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1481                 retval = 0;
1482                 goto exit;
1483         }
1484 
1485         /*
1486          * There isn't much value in putting the attributes
1487          * in the attr cache since fs_locations4 aren't
1488          * encountered very frequently, so just make them
1489          * available to the caller.
1490          */
1491         *garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1492 
1493         DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1494             nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1495 
1496         /* No fs_locations? -- return a failure */
1497         if (garp->n4g_ext_res == NULL ||
1498             garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1499                 retval = 0;
1500                 goto exit;
1501         }
1502 
1503         if (!garp->n4g_fsid_valid)
1504                 retval = 0;
1505 
1506 exit:
1507         if (retval == 0) {
1508                 /* the call was ok but failed validating the call results */
1509                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1510         } else {
1511                 ASSERT(callres != NULL);
1512                 *callres = res;
1513         }
1514 
1515         if (sp != NULL)
1516                 nfs4_server_rele(sp);
1517         kmem_free(argop, argoplist_size);
1518         return (retval);
1519 }
1520 
/*
 * Tunable to disable referral mounts: when non-zero,
 * find_referral_stubvp() returns NULL without contacting the server.
 */
int nfs4_no_referrals = 0;
1523 
1524 /*
1525  * Returns NULL if the vnode cannot be created or found.
1526  */
1527 vnode_t *
1528 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1529 {
1530         nfs_fh4 *stub_fh, *dfh;
1531         nfs4_sharedfh_t *sfhp;
1532         char *newfhval;
1533         vnode_t *vp = NULL;
1534         fattr4_mounted_on_fileid mnt_on_fileid;
1535         nfs4_ga_res_t garp;
1536         mntinfo4_t *mi;
1537         COMPOUND4res_clnt callres;
1538         hrtime_t t;
1539 
1540         if (nfs4_no_referrals)
1541                 return (NULL);
1542 
1543         /*
1544          * Get the mounted_on_fileid, unique on that server::fsid
1545          */
1546         mi = VTOMI4(dvp);
1547         if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1548             &garp, &callres, FALSE) == 0)
1549                 return (NULL);
1550         mnt_on_fileid = garp.n4g_mon_fid;
1551         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1552 
1553         /*
1554          * Build a fake filehandle from the dir FH and the mounted_on_fileid
1555          */
1556         dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1557         stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1558         stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1559             sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1560         newfhval = stub_fh->nfs_fh4_val;
1561 
1562         /* copy directory's file handle */
1563         bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1564         stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1565         newfhval = newfhval + dfh->nfs_fh4_len;
1566 
1567         /* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1568         bcopy((char *)&mnt_on_fileid, newfhval,
1569             sizeof (fattr4_mounted_on_fileid));
1570         stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1571 
1572         sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1573         kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1574             sizeof (fattr4_mounted_on_fileid));
1575         kmem_free(stub_fh, sizeof (nfs_fh4));
1576         if (sfhp == NULL)
1577                 return (NULL);
1578 
1579         t = gethrtime();
1580         garp.n4g_va.va_type = VDIR;
1581         vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1582             cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1583 
1584         if (vp != NULL)
1585                 vp->v_type = VDIR;
1586 
1587         sfh4_rele(&sfhp);
1588         return (vp);
1589 }
1590 
1591 int
1592 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1593 {
1594         vnode_t *nvp;
1595         rnode4_t *rp;
1596 
1597         if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1598                 return (EINVAL);
1599 
1600         rp = VTOR4(nvp);
1601         mutex_enter(&rp->r_statelock);
1602         r4_stub_referral(rp);
1603         mutex_exit(&rp->r_statelock);
1604         dnlc_enter(dvp, nm, nvp);
1605 
1606         if (*vpp != NULL)
1607                 VN_RELE(*vpp);  /* no longer need this vnode */
1608 
1609         *vpp = nvp;
1610 
1611         return (0);
1612 }
1613 
1614 /*
1615  * Fetch the location information and resolve the new server.
1616  * Caller needs to free up the XDR data which is returned.
1617  * Input: mount info, shared filehandle, nodename
1618  * Return: Index to the result or Error(-1)
1619  * Output: FsLocations Info, Resolved Server Info.
1620  */
1621 int
1622 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1623     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1624     struct nfs_fsl_info *fsloc)
1625 {
1626         fs_location4 *fsp;
1627         struct nfs_fsl_info nfsfsloc;
1628         int ret, i, error;
1629         nfs4_ga_res_t garp;
1630         COMPOUND4res_clnt callres;
1631         struct knetconfig *knc;
1632 
1633         ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1634         if (ret == 0)
1635                 return (-1);
1636 
1637         /*
1638          * As a lame attempt to figuring out if we're
1639          * handling a migration event or a referral,
1640          * look for rnodes with this fsid in the rnode
1641          * cache.
1642          *
1643          * If we can find one or more such rnodes, it
1644          * means we're handling a migration event and
1645          * we want to bail out in that case.
1646          */
1647         if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1648                 DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1649                     mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1650                     char *, "nfs4_process_referral");
1651                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1652                 return (-1);
1653         }
1654 
1655         /*
1656          * Find the first responsive server to mount.  When we find
1657          * one, fsp will point to it.
1658          */
1659         for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1660 
1661                 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1662                 if (fsp->server_len == 0 || fsp->server_val == NULL)
1663                         continue;
1664 
1665                 error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1666                 if (error != 0)
1667                         continue;
1668 
1669                 error = nfs4_ping_server_common(nfsfsloc.knconf,
1670                     nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1671                 if (error == RPC_SUCCESS)
1672                         break;
1673 
1674                 DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1675                     sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1676                     char *, "nfs4_process_referral");
1677 
1678                 (void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1679         }
1680         knc = nfsfsloc.knconf;
1681         if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1682             (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1683                 DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1684                     nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1685                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1686                 return (-1);
1687         }
1688 
1689         /* Send the results back */
1690         *fsloc = nfsfsloc;
1691         *grp = garp;
1692         *res = callres;
1693         return (i);
1694 }
1695 
1696 /*
1697  * Referrals case - need to fetch referral data and then upcall to
1698  * user-level to get complete mount data.
1699  */
1700 static ephemeral_servinfo_t *
1701 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1702 {
1703         struct knetconfig       *sikncp, *svkncp;
1704         struct netbuf           *bufp;
1705         ephemeral_servinfo_t    *esi;
1706         vnode_t                 *dvp;
1707         rnode4_t                *drp;
1708         fs_location4            *fsp;
1709         struct nfs_fsl_info     nfsfsloc;
1710         nfs4_ga_res_t           garp;
1711         char                    *p;
1712         char                    fn[MAXNAMELEN];
1713         int                     i, index = -1;
1714         mntinfo4_t              *mi;
1715         COMPOUND4res_clnt       callres;
1716 
1717         /*
1718          * If we're passed in a stub vnode that
1719          * isn't a "referral" stub, bail out
1720          * and return a failure
1721          */
1722         if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1723                 return (NULL);
1724 
1725         if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1726                 return (NULL);
1727 
1728         drp = VTOR4(dvp);
1729         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1730                 VN_RELE(dvp);
1731                 return (NULL);
1732         }
1733 
1734         if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1735                 nfs_rw_exit(&drp->r_rwlock);
1736                 VN_RELE(dvp);
1737                 return (NULL);
1738         }
1739 
1740         mi = VTOMI4(dvp);
1741         index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1742             &garp, &callres, &nfsfsloc);
1743         nfs_rw_exit(&drp->r_rwlock);
1744         VN_RELE(dvp);
1745         if (index < 0)
1746                 return (NULL);
1747 
1748         fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1749         esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1750 
1751         /* initially set to be our type of ephemeral mount; may be added to */
1752         esi->esi_mount_flags = NFSMNT_REFERRAL;
1753 
1754         esi->esi_hostname =
1755             kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1756         bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1757             fsp->server_val->utf8string_len);
1758         esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1759 
1760         bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1761         bufp->len = nfsfsloc.addr->len;
1762         bufp->maxlen = nfsfsloc.addr->maxlen;
1763         bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1764         bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1765         esi->esi_addr = bufp;
1766 
1767         esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1768         sikncp = esi->esi_knconf;
1769 
1770         DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1771             struct nfs_fsl_info *, &nfsfsloc,
1772             char *, "nfs4_trigger_esi_create_referral");
1773 
1774         svkncp = nfsfsloc.knconf;
1775         sikncp->knc_semantics = svkncp->knc_semantics;
1776         sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1777         (void) strlcat((char *)sikncp->knc_protofmly,
1778             (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1779         sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1780         (void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1781             KNC_STRSIZE);
1782         sikncp->knc_rdev = svkncp->knc_rdev;
1783 
1784         DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1785             struct knetconfig *, sikncp,
1786             char *, "nfs4_trigger_esi_create_referral");
1787 
1788         esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1789         bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1790         esi->esi_syncaddr = NULL;
1791 
1792         esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1793         esi->esi_path_len = MAXPATHLEN;
1794         *p++ = '/';
1795         for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1796                 component4 *comp;
1797 
1798                 comp = &fsp->rootpath.pathname4_val[i];
1799                 /* If no space, null the string and bail */
1800                 if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1801                         goto err;
1802                 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1803                 p += comp->utf8string_len;
1804                 *p++ = '/';
1805         }
1806         if (fsp->rootpath.pathname4_len != 0)
1807                 *(p - 1) = '\0';
1808         else
1809                 *p = '\0';
1810         p = esi->esi_path;
1811         esi->esi_path = strdup(p);
1812         esi->esi_path_len = strlen(p) + 1;
1813         kmem_free(p, MAXPATHLEN);
1814 
1815         /* Allocated in nfs4_process_referral() */
1816         (void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1817         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1818 
1819         return (esi);
1820 err:
1821         kmem_free(esi->esi_path, esi->esi_path_len);
1822         kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1823         kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1824         kmem_free(esi->esi_addr, sizeof (struct netbuf));
1825         kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1826         kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1827         kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1828         kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1829         kmem_free(esi, sizeof (ephemeral_servinfo_t));
1830         (void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1831         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1832         return (NULL);
1833 }
1834 
1835 /*
1836  * Assemble the args, and call the generic VFS mount function to
1837  * finally perform the ephemeral mount.
1838  */
1839 static int
1840 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1841     cred_t *cr, vnode_t **newvpp)
1842 {
1843         struct mounta   *uap;
1844         char            *mntpt, *orig_path, *path;
1845         const char      *orig_mntpt;
1846         int             retval;
1847         int             mntpt_len;
1848         int             spec_len;
1849         zone_t          *zone = curproc->p_zone;
1850         bool_t          has_leading_slash;
1851         int             i;
1852 
1853         vfs_t                   *stubvfsp = stubvp->v_vfsp;
1854         ephemeral_servinfo_t    *esi = dma->dma_esi;
1855         struct nfs_args         *nargs = dma->dma_nargs;
1856 
1857         /* first, construct the mount point for the ephemeral mount */
1858         orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1859         orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1860 
1861         if (*orig_path == '.')
1862                 orig_path++;
1863 
1864         /*
1865          * Get rid of zone's root path
1866          */
1867         if (zone != global_zone) {
1868                 /*
1869                  * -1 for trailing '/' and -1 for EOS.
1870                  */
1871                 if (strncmp(zone->zone_rootpath, orig_mntpt,
1872                     zone->zone_rootpathlen - 1) == 0) {
1873                         orig_mntpt += (zone->zone_rootpathlen - 2);
1874                 }
1875         }
1876 
1877         mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1878         mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1879         (void) strcat(mntpt, orig_mntpt);
1880         (void) strcat(mntpt, orig_path);
1881 
1882         kmem_free(path, strlen(path) + 1);
1883         path = esi->esi_path;
1884         if (*path == '.')
1885                 path++;
1886         if (path[0] == '/' && path[1] == '/')
1887                 path++;
1888         has_leading_slash = (*path == '/');
1889 
1890         spec_len = strlen(dma->dma_hostlist);
1891         spec_len += strlen(path);
1892 
1893         /* We are going to have to add this in */
1894         if (!has_leading_slash)
1895                 spec_len++;
1896 
1897         /* We need to get the ':' for dma_hostlist:esi_path */
1898         spec_len++;
1899 
1900         uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1901         uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1902         (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1903             has_leading_slash ? "" : "/", path);
1904 
1905         uap->dir = mntpt;
1906 
1907         uap->flags = MS_SYSSPACE | MS_DATA;
1908         /* fstype-independent mount options not covered elsewhere */
1909         /* copy parent's mount(1M) "-m" flag */
1910         if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1911                 uap->flags |= MS_NOMNTTAB;
1912 
1913         uap->fstype = MNTTYPE_NFS4;
1914         uap->dataptr = (char *)nargs;
1915         /* not needed for MS_SYSSPACE */
1916         uap->datalen = 0;
1917 
1918         /* use optptr to pass in extra mount options */
1919         uap->flags |= MS_OPTIONSTR;
1920         uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1921         if (uap->optptr == NULL) {
1922                 retval = EINVAL;
1923                 goto done;
1924         }
1925 
1926         /* domount() expects us to count the trailing NUL */
1927         uap->optlen = strlen(uap->optptr) + 1;
1928 
1929         /*
1930          * If we get EBUSY, we try again once to see if we can perform
1931          * the mount. We do this because of a spurious race condition.
1932          */
1933         for (i = 0; i < 2; i++) {
1934                 int     error;
1935                 bool_t  was_mounted;
1936 
1937                 retval = domount(NULL, uap, stubvp, cr, vfsp);
1938                 if (retval == 0) {
1939                         retval = VFS_ROOT(*vfsp, newvpp);
1940                         VFS_RELE(*vfsp);
1941                         break;
1942                 } else if (retval != EBUSY) {
1943                         break;
1944                 }
1945 
1946                 /*
1947                  * We might find it mounted by the other racer...
1948                  */
1949                 error = nfs4_trigger_mounted_already(stubvp,
1950                     newvpp, &was_mounted, vfsp);
1951                 if (error) {
1952                         goto done;
1953                 } else if (was_mounted) {
1954                         retval = 0;
1955                         break;
1956                 }
1957         }
1958 
1959 done:
1960         if (uap->optptr)
1961                 nfs4_trigger_destroy_mntopts(uap->optptr);
1962 
1963         kmem_free(uap->spec, spec_len + 1);
1964         kmem_free(uap, sizeof (struct mounta));
1965         kmem_free(mntpt, mntpt_len + 1);
1966 
1967         return (retval);
1968 }
1969 
1970 /*
1971  * Build an nfs_args structure for passing to domount().
1972  *
1973  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1974  * generic data - common to all ephemeral mount types - is read directly
1975  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1976  */
1977 static struct nfs_args *
1978 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1979     ephemeral_servinfo_t *esi)
1980 {
1981         sec_data_t *secdata;
1982         struct nfs_args *nargs;
1983 
1984         /* setup the nfs args */
1985         nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1986 
1987         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1988 
1989         nargs->addr = esi->esi_addr;
1990 
1991         /* for AUTH_DH by negotiation */
1992         if (esi->esi_syncaddr || esi->esi_netname) {
1993                 nargs->flags |= NFSMNT_SECURE;
1994                 nargs->syncaddr = esi->esi_syncaddr;
1995                 nargs->netname = esi->esi_netname;
1996         }
1997 
1998         nargs->flags |= NFSMNT_KNCONF;
1999         nargs->knconf = esi->esi_knconf;
2000         nargs->flags |= NFSMNT_HOSTNAME;
2001         nargs->hostname = esi->esi_hostname;
2002         nargs->fh = esi->esi_path;
2003 
2004         /* general mount settings, all copied from parent mount */
2005         mutex_enter(&mi->mi_lock);
2006 
2007         if (!(mi->mi_flags & MI4_HARD))
2008                 nargs->flags |= NFSMNT_SOFT;
2009 
2010         nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2011             NFSMNT_RETRANS;
2012         nargs->wsize = mi->mi_stsize;
2013         nargs->rsize = mi->mi_tsize;
2014         nargs->timeo = mi->mi_timeo;
2015         nargs->retrans = mi->mi_retrans;
2016 
2017         if (mi->mi_flags & MI4_INT)
2018                 nargs->flags |= NFSMNT_INT;
2019         if (mi->mi_flags & MI4_NOAC)
2020                 nargs->flags |= NFSMNT_NOAC;
2021 
2022         nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2023             NFSMNT_ACDIRMAX;
2024         nargs->acregmin = HR2SEC(mi->mi_acregmin);
2025         nargs->acregmax = HR2SEC(mi->mi_acregmax);
2026         nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2027         nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2028 
2029         /* add any specific flags for this type of ephemeral mount */
2030         nargs->flags |= esi->esi_mount_flags;
2031 
2032         if (mi->mi_flags & MI4_NOCTO)
2033                 nargs->flags |= NFSMNT_NOCTO;
2034         if (mi->mi_flags & MI4_GRPID)
2035                 nargs->flags |= NFSMNT_GRPID;
2036         if (mi->mi_flags & MI4_LLOCK)
2037                 nargs->flags |= NFSMNT_LLOCK;
2038         if (mi->mi_flags & MI4_NOPRINT)
2039                 nargs->flags |= NFSMNT_NOPRINT;
2040         if (mi->mi_flags & MI4_DIRECTIO)
2041                 nargs->flags |= NFSMNT_DIRECTIO;
2042         if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2043                 nargs->flags |= NFSMNT_PUBLIC;
2044 
2045         /* Do some referral-specific option tweaking */
2046         if (nargs->flags & NFSMNT_REFERRAL) {
2047                 nargs->flags &= ~NFSMNT_DORDMA;
2048                 nargs->flags |= NFSMNT_TRYRDMA;
2049         }
2050 
2051         mutex_exit(&mi->mi_lock);
2052 
2053         /*
2054          * Security data & negotiation policy.
2055          *
2056          * For mirror mounts, we need to preserve the parent mount's
2057          * preference for security negotiation, translating SV4_TRYSECDEFAULT
2058          * to NFSMNT_SECDEFAULT if present.
2059          *
2060          * For referrals, we always want security negotiation and will
2061          * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2062          * The reason is that we can't negotiate down from a parent's
2063          * Kerberos flavor to AUTH_SYS.
2064          *
2065          * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2066          * security flavour was requested, with data in sv_secdata, and that
2067          * no negotiation should occur. If this specified flavour fails, that's
2068          * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2069          *
2070          * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2071          * default flavour, in sv_secdata, but then negotiate a new flavour.
2072          * Possible flavours are recorded in an array in sv_secinfo, with
2073          * currently in-use flavour pointed to by sv_currsec.
2074          *
2075          * If sv_currsec is set, i.e. if negotiation has already occurred,
2076          * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2077          * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2078          */
2079         if (nargs->flags & NFSMNT_REFERRAL) {
2080                 /* enable negotiation for referral mount */
2081                 nargs->flags |= NFSMNT_SECDEFAULT;
2082                 secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2083                 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2084                 secdata->data = NULL;
2085         } else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2086                 /* enable negotiation for mirror mount */
2087                 nargs->flags |= NFSMNT_SECDEFAULT;
2088 
2089                 /*
2090                  * As a starting point for negotiation, copy parent
2091                  * mount's negotiated flavour (sv_currsec) if available,
2092                  * or its passed-in flavour (sv_secdata) if not.
2093                  */
2094                 if (svp->sv_currsec != NULL)
2095                         secdata = copy_sec_data(svp->sv_currsec);
2096                 else if (svp->sv_secdata != NULL)
2097                         secdata = copy_sec_data(svp->sv_secdata);
2098                 else
2099                         secdata = NULL;
2100         } else {
2101                 /* do not enable negotiation; copy parent's passed-in flavour */
2102                 if (svp->sv_secdata != NULL)
2103                         secdata = copy_sec_data(svp->sv_secdata);
2104                 else
2105                         secdata = NULL;
2106         }
2107 
2108         nfs_rw_exit(&svp->sv_lock);
2109 
2110         nargs->flags |= NFSMNT_NEWARGS;
2111         nargs->nfs_args_ext = NFS_ARGS_EXTB;
2112         nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2113 
2114         /* for NFS RO failover; caller will set if necessary */
2115         nargs->nfs_ext_u.nfs_extB.next = NULL;
2116 
2117         return (nargs);
2118 }
2119 
2120 static void
2121 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2122 {
2123         /*
2124          * Either the mount failed, in which case the data is not needed, or
2125          * nfs4_mount() has either taken copies of what it needs or,
2126          * where it has merely copied the ptr, it has set *our* ptr to NULL,
2127          * whereby nfs4_free_args() will ignore it.
2128          */
2129         nfs4_free_args(nargs);
2130         kmem_free(nargs, sizeof (struct nfs_args));
2131 }
2132 
2133 /*
2134  * When we finally get into the mounting, we need to add this
2135  * node to the ephemeral tree.
2136  *
2137  * This is called from nfs4_mount().
2138  */
2139 int
2140 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2141 {
2142         mntinfo4_t              *mi_parent;
2143         nfs4_ephemeral_t        *eph;
2144         nfs4_ephemeral_tree_t   *net;
2145 
2146         nfs4_ephemeral_t        *prior;
2147         nfs4_ephemeral_t        *child;
2148 
2149         nfs4_ephemeral_t        *peer;
2150 
2151         nfs4_trigger_globals_t  *ntg;
2152         zone_t                  *zone = curproc->p_zone;
2153 
2154         int                     rc = 0;
2155 
2156         mi_parent = VTOMI4(mvp);
2157 
2158         /*
2159          * Get this before grabbing anything else!
2160          */
2161         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2162         if (!ntg->ntg_thread_started) {
2163                 nfs4_ephemeral_start_harvester(ntg);
2164         }
2165 
2166         mutex_enter(&mi_parent->mi_lock);
2167         mutex_enter(&mi->mi_lock);
2168 
2169         net = mi->mi_ephemeral_tree =
2170             mi_parent->mi_ephemeral_tree;
2171 
2172         /*
2173          * If the mi_ephemeral_tree is NULL, then it
2174          * means that either the harvester or a manual
2175          * umount has cleared the tree out right before
2176          * we got here.
2177          *
2178          * There is nothing we can do here, so return
2179          * to the caller and let them decide whether they
2180          * try again.
2181          */
2182         if (net == NULL) {
2183                 mutex_exit(&mi->mi_lock);
2184                 mutex_exit(&mi_parent->mi_lock);
2185 
2186                 return (EBUSY);
2187         }
2188 
2189         /*
2190          * We've just tied the mntinfo to the tree, so
2191          * now we bump the refcnt and hold it there until
2192          * this mntinfo is removed from the tree.
2193          */
2194         nfs4_ephemeral_tree_hold(net);
2195 
2196         /*
2197          * We need to tack together the ephemeral mount
2198          * with this new mntinfo.
2199          */
2200         eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2201         eph->ne_mount = mi;
2202         MI4_HOLD(mi);
2203         VFS_HOLD(mi->mi_vfsp);
2204         eph->ne_ref_time = gethrestime_sec();
2205 
2206         /*
2207          * We need to tell the ephemeral mount when
2208          * to time out.
2209          */
2210         eph->ne_mount_to = ntg->ntg_mount_to;
2211 
2212         mi->mi_ephemeral = eph;
2213 
2214         /*
2215          * If the enclosing mntinfo4 is also ephemeral,
2216          * then we need to point to its enclosing parent.
2217          * Else the enclosing mntinfo4 is the enclosing parent.
2218          *
2219          * We also need to weave this ephemeral node
2220          * into the tree.
2221          */
2222         if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2223                 /*
2224                  * We need to decide if we are
2225                  * the root node of this branch
2226                  * or if we are a sibling of this
2227                  * branch.
2228                  */
2229                 prior = mi_parent->mi_ephemeral;
2230                 if (prior == NULL) {
2231                         /*
2232                          * Race condition, clean up, and
2233                          * let caller handle mntinfo.
2234                          */
2235                         mi->mi_flags &= ~MI4_EPHEMERAL;
2236                         mi->mi_ephemeral = NULL;
2237                         kmem_free(eph, sizeof (*eph));
2238                         VFS_RELE(mi->mi_vfsp);
2239                         MI4_RELE(mi);
2240                         nfs4_ephemeral_tree_rele(net);
2241                         rc = EBUSY;
2242                 } else {
2243                         if (prior->ne_child == NULL) {
2244                                 prior->ne_child = eph;
2245                         } else {
2246                                 child = prior->ne_child;
2247 
2248                                 prior->ne_child = eph;
2249                                 eph->ne_peer = child;
2250 
2251                                 child->ne_prior = eph;
2252                         }
2253 
2254                         eph->ne_prior = prior;
2255                 }
2256         } else {
2257                 /*
2258                  * The parent mntinfo4 is the non-ephemeral
2259                  * root of the ephemeral tree. We
2260                  * need to decide if we are the root
2261                  * node of that tree or if we are a
2262                  * sibling of the root node.
2263                  *
2264                  * We are the root if there is no
2265                  * other node.
2266                  */
2267                 if (net->net_root == NULL) {
2268                         net->net_root = eph;
2269                 } else {
2270                         eph->ne_peer = peer = net->net_root;
2271                         ASSERT(peer != NULL);
2272                         net->net_root = eph;
2273 
2274                         peer->ne_prior = eph;
2275                 }
2276 
2277                 eph->ne_prior = NULL;
2278         }
2279 
2280         mutex_exit(&mi->mi_lock);
2281         mutex_exit(&mi_parent->mi_lock);
2282 
2283         return (rc);
2284 }
2285 
2286 /*
2287  * Commit the changes to the ephemeral tree for removing this node.
2288  */
2289 static void
2290 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2291 {
2292         nfs4_ephemeral_t        *e = eph;
2293         nfs4_ephemeral_t        *peer;
2294         nfs4_ephemeral_t        *prior;
2295 
2296         peer = eph->ne_peer;
2297         prior = e->ne_prior;
2298 
2299         /*
2300          * If this branch root was not the
2301          * tree root, then we need to fix back pointers.
2302          */
2303         if (prior) {
2304                 if (prior->ne_child == e) {
2305                         prior->ne_child = peer;
2306                 } else {
2307                         prior->ne_peer = peer;
2308                 }
2309 
2310                 if (peer)
2311                         peer->ne_prior = prior;
2312         } else if (peer) {
2313                 peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2314                 peer->ne_prior = NULL;
2315         } else {
2316                 e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2317         }
2318 }
2319 
2320 /*
2321  * We want to avoid recursion at all costs. So we need to
2322  * unroll the tree. We do this by a depth first traversal to
2323  * leaf nodes. We blast away the leaf and work our way back
2324  * up and down the tree.
2325  */
2326 static int
2327 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2328     int isTreeRoot, int flag, cred_t *cr)
2329 {
2330         nfs4_ephemeral_t        *e = eph;
2331         nfs4_ephemeral_t        *prior;
2332         mntinfo4_t              *mi;
2333         vfs_t                   *vfsp;
2334         int                     error;
2335 
2336         /*
2337          * We use the loop while unrolling the ephemeral tree.
2338          */
2339         for (;;) {
2340                 /*
2341                  * First we walk down the child.
2342                  */
2343                 if (e->ne_child) {
2344                         prior = e;
2345                         e = e->ne_child;
2346                         continue;
2347                 }
2348 
2349                 /*
2350                  * If we are the root of the branch we are removing,
2351                  * we end it here. But if the branch is the root of
2352                  * the tree, we have to forge on. We do not consider
2353                  * the peer list for the root because while it may
2354                  * be okay to remove, it is both extra work and a
2355                  * potential for a false-positive error to stall the
2356                  * unmount attempt.
2357                  */
2358                 if (e == eph && isTreeRoot == FALSE)
2359                         return (0);
2360 
2361                 /*
2362                  * Next we walk down the peer list.
2363                  */
2364                 if (e->ne_peer) {
2365                         prior = e;
2366                         e = e->ne_peer;
2367                         continue;
2368                 }
2369 
2370                 /*
2371                  * We can only remove the node passed in by the
2372                  * caller if it is the root of the ephemeral tree.
2373                  * Otherwise, the caller will remove it.
2374                  */
2375                 if (e == eph && isTreeRoot == FALSE)
2376                         return (0);
2377 
2378                 /*
2379                  * Okay, we have a leaf node, time
2380                  * to prune it!
2381                  *
2382                  * Note that prior can only be NULL if
2383                  * and only if it is the root of the
2384                  * ephemeral tree.
2385                  */
2386                 prior = e->ne_prior;
2387 
2388                 mi = e->ne_mount;
2389                 mutex_enter(&mi->mi_lock);
2390                 vfsp = mi->mi_vfsp;
2391                 ASSERT(vfsp != NULL);
2392 
2393                 /*
2394                  * Cleared by umount2_engine.
2395                  */
2396                 VFS_HOLD(vfsp);
2397 
2398                 /*
2399                  * Inform nfs4_unmount to not recursively
2400                  * descend into this node's children when it
2401                  * gets processed.
2402                  */
2403                 mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2404                 mutex_exit(&mi->mi_lock);
2405 
2406                 error = umount2_engine(vfsp, flag, cr, FALSE);
2407                 if (error) {
2408                         /*
2409                          * We need to reenable nfs4_unmount's ability
2410                          * to recursively descend on this node.
2411                          */
2412                         mutex_enter(&mi->mi_lock);
2413                         mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2414                         mutex_exit(&mi->mi_lock);
2415 
2416                         return (error);
2417                 }
2418 
2419                 /*
2420                  * If we are the current node, we do not want to
2421                  * touch anything else. At this point, the only
2422                  * way the current node can have survived to here
2423                  * is if it is the root of the ephemeral tree and
2424                  * we are unmounting the enclosing mntinfo4.
2425                  */
2426                 if (e == eph) {
2427                         ASSERT(prior == NULL);
2428                         return (0);
2429                 }
2430 
2431                 /*
2432                  * Stitch up the prior node. Note that since
2433                  * we have handled the root of the tree, prior
2434                  * must be non-NULL.
2435                  */
2436                 ASSERT(prior != NULL);
2437                 if (prior->ne_child == e) {
2438                         prior->ne_child = NULL;
2439                 } else {
2440                         ASSERT(prior->ne_peer == e);
2441 
2442                         prior->ne_peer = NULL;
2443                 }
2444 
2445                 e = prior;
2446         }
2447 
2448         /* NOTREACHED */
2449 }
2450 
2451 /*
2452  * Common code to safely release net_cnt_lock and net_tree_lock
2453  */
2454 void
2455 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2456     nfs4_ephemeral_tree_t **pnet)
2457 {
2458         nfs4_ephemeral_tree_t   *net = *pnet;
2459 
2460         if (*pmust_unlock) {
2461                 mutex_enter(&net->net_cnt_lock);
2462                 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2463                 mutex_exit(&net->net_cnt_lock);
2464 
2465                 mutex_exit(&net->net_tree_lock);
2466 
2467                 *pmust_unlock = FALSE;
2468         }
2469 }
2470 
2471 /*
2472  * While we may have removed any child or sibling nodes of this
2473  * ephemeral node, we can not nuke it until we know that there
2474  * were no actived vnodes on it. This will do that final
2475  * work once we know it is not busy.
2476  */
2477 void
2478 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2479     nfs4_ephemeral_tree_t **pnet)
2480 {
2481         /*
2482          * Now we need to get rid of the ephemeral data if it exists.
2483          */
2484         mutex_enter(&mi->mi_lock);
2485         if (mi->mi_ephemeral) {
2486                 /*
2487                  * If we are the root node of an ephemeral branch
2488                  * which is being removed, then we need to fixup
2489                  * pointers into and out of the node.
2490                  */
2491                 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2492                         nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2493 
2494                 nfs4_ephemeral_tree_rele(*pnet);
2495                 ASSERT(mi->mi_ephemeral != NULL);
2496 
2497                 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2498                 mi->mi_ephemeral = NULL;
2499                 VFS_RELE(mi->mi_vfsp);
2500                 MI4_RELE(mi);
2501         }
2502         mutex_exit(&mi->mi_lock);
2503 
2504         nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2505 }
2506 
2507 /*
2508  * Unmount an ephemeral node.
2509  *
2510  * Note that if this code fails, then it must unlock.
2511  *
2512  * If it succeeds, then the caller must be prepared to do so.
2513  */
2514 int
2515 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2516     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2517 {
2518         int                     error = 0;
2519         nfs4_ephemeral_t        *eph;
2520         nfs4_ephemeral_tree_t   *net;
2521         int                     is_derooting = FALSE;
2522         int                     is_recursed = FALSE;
2523         int                     was_locked = FALSE;
2524 
2525         /*
2526          * Make sure to set the default state for cleaning
2527          * up the tree in the caller (and on the way out).
2528          */
2529         *pmust_unlock = FALSE;
2530 
2531         /*
2532          * The active vnodes on this file system may be ephemeral
2533          * children. We need to check for and try to unmount them
2534          * here. If any can not be unmounted, we are going
2535          * to return EBUSY.
2536          */
2537         mutex_enter(&mi->mi_lock);
2538 
2539         /*
2540          * If an ephemeral tree, we need to check to see if
2541          * the lock is already held. If it is, then we need
2542          * to see if we are being called as a result of
2543          * the recursive removal of some node of the tree or
2544          * if we are another attempt to remove the tree.
2545          *
2546          * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2547          * node. mi_ephemeral being non-NULL also does this.
2548          *
2549          * mi_ephemeral_tree being non-NULL is sufficient
2550          * to also indicate either it is an ephemeral node
2551          * or the enclosing mntinfo4.
2552          *
2553          * Do we need MI4_EPHEMERAL? Yes, it is useful for
2554          * when we delete the ephemeral node and need to
2555          * differentiate from an ephemeral node and the
2556          * enclosing root node.
2557          */
2558         *pnet = net = mi->mi_ephemeral_tree;
2559         if (net == NULL) {
2560                 mutex_exit(&mi->mi_lock);
2561                 return (0);
2562         }
2563 
2564         eph = mi->mi_ephemeral;
2565         is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2566         is_derooting = (eph == NULL);
2567 
2568         mutex_enter(&net->net_cnt_lock);
2569 
2570         /*
2571          * If this is not recursion, then we need to
2572          * check to see if a harvester thread has
2573          * already grabbed the lock.
2574          *
2575          * After we exit this branch, we may not
2576          * blindly return, we need to jump to
2577          * is_busy!
2578          */
2579         if (!is_recursed) {
2580                 if (net->net_status &
2581                     NFS4_EPHEMERAL_TREE_LOCKED) {
2582                         /*
2583                          * If the tree is locked, we need
2584                          * to decide whether we are the
2585                          * harvester or some explicit call
2586                          * for a umount. The only way that
2587                          * we are the harvester is if
2588                          * MS_SYSSPACE is set.
2589                          *
2590                          * We only let the harvester through
2591                          * at this point.
2592                          *
2593                          * We return EBUSY so that the
2594                          * caller knows something is
2595                          * going on. Note that by that
2596                          * time, the umount in the other
2597                          * thread may have already occured.
2598                          */
2599                         if (!(flag & MS_SYSSPACE)) {
2600                                 mutex_exit(&net->net_cnt_lock);
2601                                 mutex_exit(&mi->mi_lock);
2602 
2603                                 return (EBUSY);
2604                         }
2605 
2606                         was_locked = TRUE;
2607                 }
2608         }
2609 
2610         mutex_exit(&net->net_cnt_lock);
2611         mutex_exit(&mi->mi_lock);
2612 
2613         /*
2614          * If we are not the harvester, we need to check
2615          * to see if we need to grab the tree lock.
2616          */
2617         if (was_locked == FALSE) {
2618                 /*
2619                  * If we grab the lock, it means that no other
2620                  * operation is working on the tree. If we don't
2621                  * grab it, we need to decide if this is because
2622                  * we are a recursive call or a new operation.
2623                  */
2624                 if (mutex_tryenter(&net->net_tree_lock)) {
2625                         *pmust_unlock = TRUE;
2626                 } else {
2627                         /*
2628                          * If we are a recursive call, we can
2629                          * proceed without the lock.
2630                          * Otherwise we have to wait until
2631                          * the lock becomes free.
2632                          */
2633                         if (!is_recursed) {
2634                                 mutex_enter(&net->net_cnt_lock);
2635                                 if (net->net_status &
2636                                     (NFS4_EPHEMERAL_TREE_DEROOTING
2637                                     | NFS4_EPHEMERAL_TREE_INVALID)) {
2638                                         mutex_exit(&net->net_cnt_lock);
2639                                         goto is_busy;
2640                                 }
2641                                 mutex_exit(&net->net_cnt_lock);
2642 
2643                                 /*
2644                                  * We can't hold any other locks whilst
2645                                  * we wait on this to free up.
2646                                  */
2647                                 mutex_enter(&net->net_tree_lock);
2648 
2649                                 /*
2650                                  * Note that while mi->mi_ephemeral
2651                                  * may change and thus we have to
2652                                  * update eph, it is the case that
2653                                  * we have tied down net and
2654                                  * do not care if mi->mi_ephemeral_tree
2655                                  * has changed.
2656                                  */
2657                                 mutex_enter(&mi->mi_lock);
2658                                 eph = mi->mi_ephemeral;
2659                                 mutex_exit(&mi->mi_lock);
2660 
2661                                 /*
2662                                  * Okay, we need to see if either the
2663                                  * tree got nuked or the current node
2664                                  * got nuked. Both of which will cause
2665                                  * an error.
2666                                  *
2667                                  * Note that a subsequent retry of the
2668                                  * umount shall work.
2669                                  */
2670                                 mutex_enter(&net->net_cnt_lock);
2671                                 if (net->net_status &
2672                                     NFS4_EPHEMERAL_TREE_INVALID ||
2673                                     (!is_derooting && eph == NULL)) {
2674                                         mutex_exit(&net->net_cnt_lock);
2675                                         mutex_exit(&net->net_tree_lock);
2676                                         goto is_busy;
2677                                 }
2678                                 mutex_exit(&net->net_cnt_lock);
2679                                 *pmust_unlock = TRUE;
2680                         }
2681                 }
2682         }
2683 
2684         /*
2685          * Only once we have grabbed the lock can we mark what we
2686          * are planning on doing to the ephemeral tree.
2687          */
2688         if (*pmust_unlock) {
2689                 mutex_enter(&net->net_cnt_lock);
2690                 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2691 
2692                 /*
2693                  * Check to see if we are nuking the root.
2694                  */
2695                 if (is_derooting)
2696                         net->net_status |=
2697                             NFS4_EPHEMERAL_TREE_DEROOTING;
2698                 mutex_exit(&net->net_cnt_lock);
2699         }
2700 
2701         if (!is_derooting) {
2702                 /*
2703                  * Only work on children if the caller has not already
2704                  * done so.
2705                  */
2706                 if (!is_recursed) {
2707                         ASSERT(eph != NULL);
2708 
2709                         error = nfs4_ephemeral_unmount_engine(eph,
2710                             FALSE, flag, cr);
2711                         if (error)
2712                                 goto is_busy;
2713                 }
2714         } else {
2715                 eph = net->net_root;
2716 
2717                 /*
2718                  * Only work if there is something there.
2719                  */
2720                 if (eph) {
2721                         error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2722                             flag, cr);
2723                         if (error) {
2724                                 mutex_enter(&net->net_cnt_lock);
2725                                 net->net_status &=
2726                                     ~NFS4_EPHEMERAL_TREE_DEROOTING;
2727                                 mutex_exit(&net->net_cnt_lock);
2728                                 goto is_busy;
2729                         }
2730 
2731                         /*
2732                          * Nothing else which goes wrong will
2733                          * invalidate the blowing away of the
2734                          * ephmeral tree.
2735                          */
2736                         net->net_root = NULL;
2737                 }
2738 
2739                 /*
2740                  * We have derooted and we have caused the tree to be
2741                  * invalidated.
2742                  */
2743                 mutex_enter(&net->net_cnt_lock);
2744                 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2745                 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2746                 DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2747                     uint_t, net->net_refcnt);
2748 
2749                 /*
2750                  * We will not finalize this node, so safe to
2751                  * release it.
2752                  */
2753                 nfs4_ephemeral_tree_decr(net);
2754                 mutex_exit(&net->net_cnt_lock);
2755 
2756                 if (was_locked == FALSE)
2757                         mutex_exit(&net->net_tree_lock);
2758 
2759                 /*
2760                  * We have just blown away any notation of this
2761                  * tree being locked or having a refcnt.
2762                  * We can't let the caller try to clean things up.
2763                  */
2764                 *pmust_unlock = FALSE;
2765 
2766                 /*
2767                  * At this point, the tree should no longer be
2768                  * associated with the mntinfo4. We need to pull
2769                  * it off there and let the harvester take
2770                  * care of it once the refcnt drops.
2771                  */
2772                 mutex_enter(&mi->mi_lock);
2773                 mi->mi_ephemeral_tree = NULL;
2774                 mutex_exit(&mi->mi_lock);
2775         }
2776 
2777         return (0);
2778 
2779 is_busy:
2780 
2781         nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2782 
2783         return (error);
2784 }
2785 
2786 /*
2787  * Do the umount and record any error in the parent.
2788  */
2789 static void
2790 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2791     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2792 {
2793         int     error;
2794 
2795         /*
2796          * Only act on if the fs is still mounted.
2797          */
2798         if (vfsp == NULL)
2799                 return;
2800 
2801         error = umount2_engine(vfsp, flag, kcred, FALSE);
2802         if (error) {
2803                 if (prior) {
2804                         if (prior->ne_child == e)
2805                                 prior->ne_state |=
2806                                     NFS4_EPHEMERAL_CHILD_ERROR;
2807                         else
2808                                 prior->ne_state |=
2809                                     NFS4_EPHEMERAL_PEER_ERROR;
2810                 }
2811         }
2812 }
2813 
2814 /*
2815  * For each tree in the forest (where the forest is in
2816  * effect all of the ephemeral trees for this zone),
2817  * scan to see if a node can be unmounted. Note that
2818  * unlike nfs4_ephemeral_unmount_engine(), we do
2819  * not process the current node before children or
2820  * siblings. I.e., if a node can be unmounted, we
2821  * do not recursively check to see if the nodes
2822  * hanging off of it can also be unmounted.
2823  *
2824  * Instead, we delve down deep to try and remove the
2825  * children first. Then, because we share code with
2826  * nfs4_ephemeral_unmount_engine(), we will try
2827  * them again. This could be a performance issue in
2828  * the future.
2829  *
2830  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2831  * we do not halt on an error. We will not remove the
2832  * current node, but we will keep on trying to remove
2833  * the others.
2834  *
2835  * force indicates that we want the unmount to occur
2836  * even if there is something blocking it.
2837  *
2838  * time_check indicates that we want to see if the
2839  * mount has expired past mount_to or not. Typically
2840  * we want to do this and only on a shutdown of the
2841  * zone would we want to ignore the check.
2842  */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
        nfs4_ephemeral_tree_t   *net;
        nfs4_ephemeral_tree_t   *prev = NULL;
        nfs4_ephemeral_tree_t   *next;
        nfs4_ephemeral_t        *e;
        nfs4_ephemeral_t        *prior;
        time_t                  now = gethrestime_sec();

        /*
         * Local list of trees unlinked from the forest; they are
         * destroyed only after ntg_forest_lock has been dropped.
         */
        nfs4_ephemeral_tree_t   *harvest = NULL;

        int                     flag;

        mntinfo4_t              *mi;
        vfs_t                   *vfsp;

        /*
         * MS_SYSSPACE is always passed; nfs4_ephemeral_umount()
         * uses it to recognize the harvester.
         */
        if (force)
                flag = MS_FORCE | MS_SYSSPACE;
        else
                flag = MS_SYSSPACE;

        mutex_enter(&ntg->ntg_forest_lock);
        for (net = ntg->ntg_forest; net != NULL; net = next) {
                /*
                 * Remember the successor now: net_next is reused
                 * to chain the tree onto the harvest list below.
                 */
                next = net->net_next;

                nfs4_ephemeral_tree_hold(net);

                mutex_enter(&net->net_tree_lock);

                /*
                 * Let the unmount code know that the
                 * tree is already locked!
                 */
                mutex_enter(&net->net_cnt_lock);
                net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
                mutex_exit(&net->net_cnt_lock);

                /*
                 * If the intent is force all ephemeral nodes to
                 * be unmounted in this zone, we can short circuit a
                 * lot of tree traversal and simply zap the root node.
                 */
                if (force) {
                        if (net->net_root) {
                                mi = net->net_root->ne_mount;

                                vfsp = mi->mi_vfsp;
                                ASSERT(vfsp != NULL);

                                /*
                                 * Cleared by umount2_engine.
                                 */
                                VFS_HOLD(vfsp);

                                /*
                                 * The result is deliberately ignored;
                                 * we move on to the next tree either
                                 * way.
                                 */
                                (void) umount2_engine(vfsp, flag,
                                    kcred, FALSE);

                                goto check_done;
                        }
                }

                /*
                 * Walk the tree iteratively. Each node's ne_state
                 * records how far we have got with it: first visit
                 * its child, then its sibling, then process the
                 * node itself and back up via ne_prior.
                 */
                e = net->net_root;
                if (e)
                        e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

                while (e) {
                        if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
                                /*
                                 * Descend into the child (if any);
                                 * when we come back to this node the
                                 * sibling is next.
                                 */
                                e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
                                if (e->ne_child) {
                                        e = e->ne_child;
                                        e->ne_state =
                                            NFS4_EPHEMERAL_VISIT_CHILD;
                                }

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_VISIT_SIBLING) {
                                /*
                                 * Move across to the peer (if any);
                                 * when we come back to this node it
                                 * is ready to be processed.
                                 */
                                e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
                                if (e->ne_peer) {
                                        e = e->ne_peer;
                                        e->ne_state =
                                            NFS4_EPHEMERAL_VISIT_CHILD;
                                }

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_CHILD_ERROR) {
                                prior = e->ne_prior;

                                /*
                                 * If a child reported an error, do
                                 * not bother trying to unmount.
                                 *
                                 * If your prior node is a parent,
                                 * pass the error up such that they
                                 * also do not try to unmount.
                                 *
                                 * However, if your prior is a sibling,
                                 * let them try to unmount if they can.
                                 */
                                if (prior) {
                                        if (prior->ne_child == e)
                                                prior->ne_state |=
                                                    NFS4_EPHEMERAL_CHILD_ERROR;
                                        else
                                                prior->ne_state |=
                                                    NFS4_EPHEMERAL_PEER_ERROR;
                                }

                                /*
                                 * Clear the error and if needed, process peers.
                                 *
                                 * Once we mask out the error, we know whether
                                 * or not we have to process another node.
                                 */
                                e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
                                if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
                                        e = prior;

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_PEER_ERROR) {
                                prior = e->ne_prior;

                                /*
                                 * Hand the error on to the prior node.
                                 * NOTE(review): this branch assigns
                                 * ne_state where the CHILD_ERROR branch
                                 * above ORs it in - confirm that the
                                 * difference is intended.
                                 */
                                if (prior) {
                                        if (prior->ne_child == e)
                                                prior->ne_state =
                                                    NFS4_EPHEMERAL_CHILD_ERROR;
                                        else
                                                prior->ne_state =
                                                    NFS4_EPHEMERAL_PEER_ERROR;
                                }

                                /*
                                 * Clear the error from this node and do the
                                 * correct processing.
                                 */
                                e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
                                continue;
                        }

                        /*
                         * None of the states above matched; reset the
                         * node's state and consider it for unmounting.
                         */
                        prior = e->ne_prior;
                        e->ne_state = NFS4_EPHEMERAL_OK;

                        /*
                         * It must be the case that we need to process
                         * this node.
                         */
                        if (!time_check ||
                            now - e->ne_ref_time > e->ne_mount_to) {
                                mi = e->ne_mount;
                                vfsp = mi->mi_vfsp;

                                /*
                                 * Cleared by umount2_engine.
                                 */
                                if (vfsp != NULL)
                                        VFS_HOLD(vfsp);

                                /*
                                 * Note that we effectively work down to the
                                 * leaf nodes first, try to unmount them,
                                 * then work our way back up into the leaf
                                 * nodes.
                                 *
                                 * Also note that we deal with a lot of
                                 * complexity by sharing the work with
                                 * the manual unmount code.
                                 */
                                nfs4_ephemeral_record_umount(vfsp, flag,
                                    e, prior);
                        }

                        /*
                         * Back up to the node we arrived from; a NULL
                         * prior ends the walk of this tree.
                         */
                        e = prior;
                }

check_done:

                /*
                 * At this point we are done processing this tree.
                 *
                 * If the tree is invalid and we were the only reference
                 * to it, then we push it on the local linked list
                 * to remove it at the end. We avoid that action now
                 * to keep the tree processing going along at a fair clip.
                 *
                 * Else, even if we were the only reference, we
                 * allow it to be reused as needed.
                 */
                mutex_enter(&net->net_cnt_lock);
                nfs4_ephemeral_tree_decr(net);
                if (net->net_refcnt == 0 &&
                    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
                        net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
                        mutex_exit(&net->net_cnt_lock);
                        mutex_exit(&net->net_tree_lock);

                        /*
                         * Unlink the tree from the forest. prev is
                         * left alone as it is still the predecessor
                         * of the next tree we will look at.
                         */
                        if (prev)
                                prev->net_next = net->net_next;
                        else
                                ntg->ntg_forest = net->net_next;

                        net->net_next = harvest;
                        harvest = net;

                        /*
                         * Release the holds on the mount that the
                         * tree was attached to.
                         */
                        VFS_RELE(net->net_mount->mi_vfsp);
                        MI4_RELE(net->net_mount);

                        continue;
                }

                net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
                mutex_exit(&net->net_cnt_lock);
                mutex_exit(&net->net_tree_lock);

                prev = net;
        }
        mutex_exit(&ntg->ntg_forest_lock);

        /*
         * Now that the forest lock has been dropped, destroy the
         * trees which were pulled out of the forest above.
         */
        for (net = harvest; net != NULL; net = next) {
                next = net->net_next;

                mutex_destroy(&net->net_tree_lock);
                mutex_destroy(&net->net_cnt_lock);
                kmem_free(net, sizeof (*net));
        }
}
3072 
3073 /*
3074  * This is the thread which decides when the harvesting
3075  * can proceed and when to kill it off for this zone.
3076  */
3077 static void
3078 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3079 {
3080         clock_t         timeleft;
3081         zone_t          *zone = curproc->p_zone;
3082 
3083         for (;;) {
3084                 timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3085                     nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3086 
3087                 /*
3088                  * zone is exiting...
3089                  */
3090                 if (timeleft != -1) {
3091                         ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3092                         zthread_exit();
3093                         /* NOTREACHED */
3094                 }
3095 
3096                 /*
3097                  * Only bother scanning if there is potential
3098                  * work to be done.
3099                  */
3100                 if (ntg->ntg_forest == NULL)
3101                         continue;
3102 
3103                 /*
3104                  * Now scan the list and get rid of everything which
3105                  * is old.
3106                  */
3107                 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3108         }
3109 
3110         /* NOTREACHED */
3111 }
3112 
3113 /*
3114  * The zone specific glue needed to start the unmount harvester.
3115  *
3116  * Note that we want to avoid holding the mutex as long as possible,
3117  * hence the multiple checks.
3118  *
3119  * The caller should avoid us getting down here in the first
3120  * place.
3121  */
3122 static void
3123 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3124 {
3125         /*
3126          * It got started before we got here...
3127          */
3128         if (ntg->ntg_thread_started)
3129                 return;
3130 
3131         mutex_enter(&nfs4_ephemeral_thread_lock);
3132 
3133         if (ntg->ntg_thread_started) {
3134                 mutex_exit(&nfs4_ephemeral_thread_lock);
3135                 return;
3136         }
3137 
3138         /*
3139          * Start the unmounter harvester thread for this zone.
3140          */
3141         (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3142             ntg, 0, minclsyspri);
3143 
3144         ntg->ntg_thread_started = TRUE;
3145         mutex_exit(&nfs4_ephemeral_thread_lock);
3146 }
3147 
3148 /*ARGSUSED*/
3149 static void *
3150 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3151 {
3152         nfs4_trigger_globals_t  *ntg;
3153 
3154         ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3155         ntg->ntg_thread_started = FALSE;
3156 
3157         /*
3158          * This is the default....
3159          */
3160         ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3161 
3162         mutex_init(&ntg->ntg_forest_lock, NULL,
3163             MUTEX_DEFAULT, NULL);
3164 
3165         return (ntg);
3166 }
3167 
3168 /*
3169  * Try a nice gentle walk down the forest and convince
3170  * all of the trees to gracefully give it up.
3171  */
3172 /*ARGSUSED*/
3173 static void
3174 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3175 {
3176         nfs4_trigger_globals_t  *ntg = arg;
3177 
3178         if (!ntg)
3179                 return;
3180 
3181         nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3182 }
3183 
3184 /*
3185  * Race along the forest and rip all of the trees out by
3186  * their rootballs!
3187  */
3188 /*ARGSUSED*/
3189 static void
3190 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3191 {
3192         nfs4_trigger_globals_t  *ntg = arg;
3193 
3194         if (!ntg)
3195                 return;
3196 
3197         nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3198 
3199         mutex_destroy(&ntg->ntg_forest_lock);
3200         kmem_free(ntg, sizeof (*ntg));
3201 }
3202 
3203 /*
3204  * This is the zone independent cleanup needed for
3205  * emphemeral mount processing.
3206  */
3207 void
3208 nfs4_ephemeral_fini(void)
3209 {
3210         (void) zone_key_delete(nfs4_ephemeral_key);
3211         mutex_destroy(&nfs4_ephemeral_thread_lock);
3212 }
3213 
3214 /*
3215  * This is the zone independent initialization needed for
3216  * emphemeral mount processing.
3217  */
3218 void
3219 nfs4_ephemeral_init(void)
3220 {
3221         mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3222             NULL);
3223 
3224         zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3225             nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3226 }
3227 
3228 /*
3229  * nfssys() calls this function to set the per-zone
3230  * value of mount_to to drive when an ephemeral mount is
3231  * timed out. Each mount will grab a copy of this value
3232  * when mounted.
3233  */
3234 void
3235 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3236 {
3237         nfs4_trigger_globals_t  *ntg;
3238         zone_t                  *zone = curproc->p_zone;
3239 
3240         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3241 
3242         ntg->ntg_mount_to = mount_to;
3243 }
3244 
3245 /*
3246  * Walk the list of v4 mount options; if they are currently set in vfsp,
3247  * append them to a new comma-separated mount option string, and return it.
3248  *
3249  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3250  */
3251 static char *
3252 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3253 {
3254         uint_t i;
3255         char *mntopts;
3256         struct vfssw *vswp;
3257         mntopts_t *optproto;
3258 
3259         mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3260 
3261         /* get the list of applicable mount options for v4; locks *vswp */
3262         vswp = vfs_getvfssw(MNTTYPE_NFS4);
3263         optproto = &vswp->vsw_optproto;
3264 
3265         for (i = 0; i < optproto->mo_count; i++) {
3266                 struct mntopt *mop = &optproto->mo_list[i];
3267 
3268                 if (mop->mo_flags & MO_EMPTY)
3269                         continue;
3270 
3271                 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3272                         kmem_free(mntopts, MAX_MNTOPT_STR);
3273                         vfs_unrefvfssw(vswp);
3274                         return (NULL);
3275                 }
3276         }
3277 
3278         vfs_unrefvfssw(vswp);
3279 
3280         /*
3281          * MNTOPT_XATTR is not in the v4 mount opt proto list,
3282          * and it may only be passed via MS_OPTIONSTR, so we
3283          * must handle it here.
3284          *
3285          * Ideally, it would be in the list, but NFS does not specify its
3286          * own opt proto list, it uses instead the default one. Since
3287          * not all filesystems support extended attrs, it would not be
3288          * appropriate to add it there.
3289          */
3290         if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3291             nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3292                 kmem_free(mntopts, MAX_MNTOPT_STR);
3293                 return (NULL);
3294         }
3295 
3296         return (mntopts);
3297 }
3298 
3299 static void
3300 nfs4_trigger_destroy_mntopts(char *mntopts)
3301 {
3302         if (mntopts)
3303                 kmem_free(mntopts, MAX_MNTOPT_STR);
3304 }
3305 
3306 /*
3307  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3308  */
3309 static int
3310 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3311 {
3312         if (mntopts == NULL || optname == NULL || vfsp == NULL)
3313                 return (EINVAL);
3314 
3315         if (vfs_optionisset(vfsp, optname, NULL)) {
3316                 size_t mntoptslen = strlen(mntopts);
3317                 size_t optnamelen = strlen(optname);
3318 
3319                 /* +1 for ',', +1 for NUL */
3320                 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3321                         return (EOVERFLOW);
3322 
3323                 /* first or subsequent mount option? */
3324                 if (*mntopts != '\0')
3325                         (void) strcat(mntopts, ",");
3326 
3327                 (void) strcat(mntopts, optname);
3328         }
3329 
3330         return (0);
3331 }
3332 
3333 static enum clnt_stat
3334 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3335 {
3336         int retries;
3337         uint_t max_msgsize;
3338         enum clnt_stat status;
3339         CLIENT *cl;
3340         struct timeval timeout;
3341 
3342         /* as per recov_newserver() */
3343         max_msgsize = 0;
3344         retries = 1;
3345         timeout.tv_sec = 2;
3346         timeout.tv_usec = 0;
3347 
3348         if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3349             max_msgsize, retries, CRED(), &cl) != 0)
3350                 return (RPC_FAILED);
3351 
3352         if (nointr)
3353                 cl->cl_nosignal = TRUE;
3354         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3355             timeout);
3356         if (nointr)
3357                 cl->cl_nosignal = FALSE;
3358 
3359         AUTH_DESTROY(cl->cl_auth);
3360         CLNT_DESTROY(cl);
3361 
3362         return (status);
3363 }
3364 
3365 static enum clnt_stat
3366 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3367 {
3368         return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3369 }