1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 /*
  32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  33  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/time.h>
  41 #include <sys/vnode.h>
  42 #include <sys/vfs.h>
  43 #include <sys/vfs_opreg.h>
  44 #include <sys/file.h>
  45 #include <sys/filio.h>
  46 #include <sys/uio.h>
  47 #include <sys/buf.h>
  48 #include <sys/mman.h>
  49 #include <sys/pathname.h>
  50 #include <sys/dirent.h>
  51 #include <sys/debug.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/flock.h>
  55 #include <sys/swap.h>
  56 #include <sys/errno.h>
  57 #include <sys/strsubr.h>
  58 #include <sys/sysmacros.h>
  59 #include <sys/kmem.h>
  60 #include <sys/cmn_err.h>
  61 #include <sys/pathconf.h>
  62 #include <sys/utsname.h>
  63 #include <sys/dnlc.h>
  64 #include <sys/acl.h>
  65 #include <sys/systeminfo.h>
  66 #include <sys/atomic.h>
  67 #include <sys/policy.h>
  68 #include <sys/sdt.h>
  69 #include <sys/zone.h>
  70 
  71 #include <rpc/types.h>
  72 #include <rpc/auth.h>
  73 #include <rpc/clnt.h>
  74 #include <rpc/rpc_rdma.h>
  75 
  76 #include <nfs/nfs.h>
  77 #include <nfs/nfs_clnt.h>
  78 #include <nfs/rnode.h>
  79 #include <nfs/nfs_acl.h>
  80 #include <nfs/lm.h>
  81 
  82 #include <vm/hat.h>
  83 #include <vm/as.h>
  84 #include <vm/page.h>
  85 #include <vm/pvn.h>
  86 #include <vm/seg.h>
  87 #include <vm/seg_map.h>
  88 #include <vm/seg_kpm.h>
  89 #include <vm/seg_vn.h>
  90 
  91 #include <fs/fs_subr.h>
  92 
  93 #include <sys/ddi.h>
  94 
/*
 * Forward declarations of the file-local (static) helper routines used by
 * the NFS version 3 vnode operations.  Parameter names are omitted; see
 * each routine's definition for the meaning of its arguments.
 */
static int      nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *);
static int      nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
                        stable_how *);
static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
static int      nfs3_accessx(void *, int, cred_t *);
static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *, int);
static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *);
static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *);
static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
static int      nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *);
static void     nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
                        cred_t *);
static int      nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static int      nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
static void     nfs3_set_mod(vnode_t *);
static void     nfs3_get_commit(vnode_t *);
static void     nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int      nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
                        cred_t *);
static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
                        cred_t *);
static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 135 
 136 /*
 137  * Error flags used to pass information about certain special errors
 138  * which need to be handled specially.
 139  */
 140 #define NFS_EOF                 -98
 141 #define NFS_VERF_MISMATCH       -97
 142 
 143 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 144 #define ALIGN64(x, ptr, sz)                                             \
 145         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);           \
 146         if (x) {                                                        \
 147                 x = sizeof (uint64_t) - (x);                            \
 148                 sz -= (x);                                              \
 149                 ptr += (x);                                             \
 150         }
 151 
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */
 163 
/*
 * Forward declarations of the NFS version 3 vnode operation entry points.
 * Each of these is installed in nfs3_vnodeops_template[] below.
 */
static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
                        caller_context_t *);
static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
                        caller_context_t *);
static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
                        caller_context_t *);
static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
                        struct pathname *, int, vnode_t *, cred_t *,
                        caller_context_t *, int *, pathname_t *);
static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *, int, caller_context_t *,
                        vsecattr_t *);
static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
                        int);
static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
                        cred_t *, caller_context_t *, int, vsecattr_t *);
static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
                        cred_t *, caller_context_t *, int);
static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
                        caller_context_t *, int);
static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *, caller_context_t *);
static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
                        caller_context_t *);
static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
                        struct flk_callback *, cred_t *, caller_context_t *);
static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
                        cred_t *, caller_context_t *);
static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
                        caller_context_t *);
static int      nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *, caller_context_t *);
static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
                        caller_context_t *);
static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
                        caller_context_t *);
 234 
 235 struct vnodeops *nfs3_vnodeops;
 236 
 237 const fs_operation_def_t nfs3_vnodeops_template[] = {
 238         VOPNAME_OPEN,           { .vop_open = nfs3_open },
 239         VOPNAME_CLOSE,          { .vop_close = nfs3_close },
 240         VOPNAME_READ,           { .vop_read = nfs3_read },
 241         VOPNAME_WRITE,          { .vop_write = nfs3_write },
 242         VOPNAME_IOCTL,          { .vop_ioctl = nfs3_ioctl },
 243         VOPNAME_GETATTR,        { .vop_getattr = nfs3_getattr },
 244         VOPNAME_SETATTR,        { .vop_setattr = nfs3_setattr },
 245         VOPNAME_ACCESS,         { .vop_access = nfs3_access },
 246         VOPNAME_LOOKUP,         { .vop_lookup = nfs3_lookup },
 247         VOPNAME_CREATE,         { .vop_create = nfs3_create },
 248         VOPNAME_REMOVE,         { .vop_remove = nfs3_remove },
 249         VOPNAME_LINK,           { .vop_link = nfs3_link },
 250         VOPNAME_RENAME,         { .vop_rename = nfs3_rename },
 251         VOPNAME_MKDIR,          { .vop_mkdir = nfs3_mkdir },
 252         VOPNAME_RMDIR,          { .vop_rmdir = nfs3_rmdir },
 253         VOPNAME_READDIR,        { .vop_readdir = nfs3_readdir },
 254         VOPNAME_SYMLINK,        { .vop_symlink = nfs3_symlink },
 255         VOPNAME_READLINK,       { .vop_readlink = nfs3_readlink },
 256         VOPNAME_FSYNC,          { .vop_fsync = nfs3_fsync },
 257         VOPNAME_INACTIVE,       { .vop_inactive = nfs3_inactive },
 258         VOPNAME_FID,            { .vop_fid = nfs3_fid },
 259         VOPNAME_RWLOCK,         { .vop_rwlock = nfs3_rwlock },
 260         VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs3_rwunlock },
 261         VOPNAME_SEEK,           { .vop_seek = nfs3_seek },
 262         VOPNAME_FRLOCK,         { .vop_frlock = nfs3_frlock },
 263         VOPNAME_SPACE,          { .vop_space = nfs3_space },
 264         VOPNAME_REALVP,         { .vop_realvp = nfs3_realvp },
 265         VOPNAME_GETPAGE,        { .vop_getpage = nfs3_getpage },
 266         VOPNAME_PUTPAGE,        { .vop_putpage = nfs3_putpage },
 267         VOPNAME_MAP,            { .vop_map = nfs3_map },
 268         VOPNAME_ADDMAP,         { .vop_addmap = nfs3_addmap },
 269         VOPNAME_DELMAP,         { .vop_delmap = nfs3_delmap },
 270         /* no separate nfs3_dump */
 271         VOPNAME_DUMP,           { .vop_dump = nfs_dump },
 272         VOPNAME_PATHCONF,       { .vop_pathconf = nfs3_pathconf },
 273         VOPNAME_PAGEIO,         { .vop_pageio = nfs3_pageio },
 274         VOPNAME_DISPOSE,        { .vop_dispose = nfs3_dispose },
 275         VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs3_setsecattr },
 276         VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs3_getsecattr },
 277         VOPNAME_SHRLOCK,        { .vop_shrlock = nfs3_shrlock },
 278         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 279         NULL,                   NULL
 280 };
 281 
 282 /*
 283  * XXX:  This is referenced in modstubs.s
 284  */
 285 struct vnodeops *
 286 nfs3_getvnodeops(void)
 287 {
 288         return (nfs3_vnodeops);
 289 }
 290 
 291 /* ARGSUSED */
 292 static int
 293 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 294 {
 295         int error;
 296         struct vattr va;
 297         rnode_t *rp;
 298         vnode_t *vp;
 299 
 300         vp = *vpp;
 301         if (nfs_zone() != VTOMI(vp)->mi_zone)
 302                 return (EIO);
 303         rp = VTOR(vp);
 304         mutex_enter(&rp->r_statelock);
 305         if (rp->r_cred == NULL) {
 306                 crhold(cr);
 307                 rp->r_cred = cr;
 308         }
 309         mutex_exit(&rp->r_statelock);
 310 
 311         /*
 312          * If there is no cached data or if close-to-open
 313          * consistency checking is turned off, we can avoid
 314          * the over the wire getattr.  Otherwise, if the
 315          * file system is mounted readonly, then just verify
 316          * the caches are up to date using the normal mechanism.
 317          * Else, if the file is not mmap'd, then just mark
 318          * the attributes as timed out.  They will be refreshed
 319          * and the caches validated prior to being used.
 320          * Else, the file system is mounted writeable so
 321          * force an over the wire GETATTR in order to ensure
 322          * that all cached data is valid.
 323          */
 324         if (vp->v_count > 1 ||
 325             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 326             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 327                 if (vn_is_readonly(vp))
 328                         error = nfs3_validate_caches(vp, cr);
 329                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 330                         PURGE_ATTRCACHE(vp);
 331                         error = 0;
 332                 } else {
 333                         va.va_mask = AT_ALL;
 334                         error = nfs3_getattr_otw(vp, &va, cr);
 335                 }
 336         } else
 337                 error = 0;
 338 
 339         return (error);
 340 }
 341 
 342 /* ARGSUSED */
 343 static int
 344 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 345                 caller_context_t *ct)
 346 {
 347         rnode_t *rp;
 348         int error;
 349         struct vattr va;
 350 
 351         /*
 352          * zone_enter(2) prevents processes from changing zones with NFS files
 353          * open; if we happen to get here from the wrong zone we can't do
 354          * anything over the wire.
 355          */
 356         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 357                 /*
 358                  * We could attempt to clean up locks, except we're sure
 359                  * that the current process didn't acquire any locks on
 360                  * the file: any attempt to lock a file belong to another zone
 361                  * will fail, and one can't lock an NFS file and then change
 362                  * zones, as that fails too.
 363                  *
 364                  * Returning an error here is the sane thing to do.  A
 365                  * subsequent call to VN_RELE() which translates to a
 366                  * nfs3_inactive() will clean up state: if the zone of the
 367                  * vnode's origin is still alive and kicking, an async worker
 368                  * thread will handle the request (from the correct zone), and
 369                  * everything (minus the commit and final nfs3_getattr_otw()
 370                  * call) should be OK. If the zone is going away
 371                  * nfs_async_inactive() will throw away cached pages inline.
 372                  */
 373                 return (EIO);
 374         }
 375 
 376         /*
 377          * If we are using local locking for this filesystem, then
 378          * release all of the SYSV style record locks.  Otherwise,
 379          * we are doing network locking and we need to release all
 380          * of the network locks.  All of the locks held by this
 381          * process on this file are released no matter what the
 382          * incoming reference count is.
 383          */
 384         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 385                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 386                 cleanshares(vp, ttoproc(curthread)->p_pid);
 387         } else
 388                 nfs_lockrelease(vp, flag, offset, cr);
 389 
 390         if (count > 1)
 391                 return (0);
 392 
 393         /*
 394          * If the file has been `unlinked', then purge the
 395          * DNLC so that this vnode will get reycled quicker
 396          * and the .nfs* file on the server will get removed.
 397          */
 398         rp = VTOR(vp);
 399         if (rp->r_unldvp != NULL)
 400                 dnlc_purge_vp(vp);
 401 
 402         /*
 403          * If the file was open for write and there are pages,
 404          * then if the file system was mounted using the "no-close-
 405          *      to-open" semantics, then start an asynchronous flush
 406          *      of the all of the pages in the file.
 407          * else the file system was not mounted using the "no-close-
 408          *      to-open" semantics, then do a synchronous flush and
 409          *      commit of all of the dirty and uncommitted pages.
 410          *
 411          * The asynchronous flush of the pages in the "nocto" path
 412          * mostly just associates a cred pointer with the rnode so
 413          * writes which happen later will have a better chance of
 414          * working.  It also starts the data being written to the
 415          * server, but without unnecessarily delaying the application.
 416          */
 417         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 418                 if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 419                         error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
 420                             cr, ct);
 421                         if (error == EAGAIN)
 422                                 error = 0;
 423                 } else
 424                         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
 425                 if (!error) {
 426                         mutex_enter(&rp->r_statelock);
 427                         error = rp->r_error;
 428                         rp->r_error = 0;
 429                         mutex_exit(&rp->r_statelock);
 430                 }
 431         } else {
 432                 mutex_enter(&rp->r_statelock);
 433                 error = rp->r_error;
 434                 rp->r_error = 0;
 435                 mutex_exit(&rp->r_statelock);
 436         }
 437 
 438         /*
 439          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 440          * refresh the attribute cache with a set of attributes which
 441          * weren't returned from a WRITE.  This will enable the close-
 442          * to-open processing to work.
 443          */
 444         if (rp->r_flags & RWRITEATTR)
 445                 (void) nfs3_getattr_otw(vp, &va, cr);
 446 
 447         return (error);
 448 }
 449 
 450 /* ARGSUSED */
 451 static int
 452 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 453 {
 454         mntinfo_t *mi;
 455         READ3args args;
 456         READ3uiores res;
 457         int tsize;
 458         offset_t offset;
 459         ssize_t count;
 460         int error;
 461         int douprintf;
 462         failinfo_t fi;
 463         char *sv_hostname;
 464 
 465         mi = VTOMI(vp);
 466         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 467         sv_hostname = VTOR(vp)->r_server->sv_hostname;
 468 
 469         douprintf = 1;
 470         args.file = *VTOFH3(vp);
 471         fi.vp = vp;
 472         fi.fhp = (caddr_t)&args.file;
 473         fi.copyproc = nfs3copyfh;
 474         fi.lookupproc = nfs3lookup;
 475         fi.xattrdirproc = acl_getxattrdir3;
 476 
 477         res.uiop = uiop;
 478 
 479         res.wlist = NULL;
 480 
 481         offset = uiop->uio_loffset;
 482         count = uiop->uio_resid;
 483 
 484         do {
 485                 if (mi->mi_io_kstats) {
 486                         mutex_enter(&mi->mi_lock);
 487                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 488                         mutex_exit(&mi->mi_lock);
 489                 }
 490 
 491                 do {
 492                         tsize = MIN(mi->mi_tsize, count);
 493                         args.offset = (offset3)offset;
 494                         args.count = (count3)tsize;
 495                         res.size = (uint_t)tsize;
 496                         args.res_uiop = uiop;
 497                         args.res_data_val_alt = NULL;
 498 
 499                         error = rfs3call(mi, NFSPROC3_READ,
 500                             xdr_READ3args, (caddr_t)&args,
 501                             xdr_READ3uiores, (caddr_t)&res, cr,
 502                             &douprintf, &res.status, 0, &fi);
 503                 } while (error == ENFS_TRYAGAIN);
 504 
 505                 if (mi->mi_io_kstats) {
 506                         mutex_enter(&mi->mi_lock);
 507                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 508                         mutex_exit(&mi->mi_lock);
 509                 }
 510 
 511                 if (error)
 512                         return (error);
 513 
 514                 error = geterrno3(res.status);
 515                 if (error)
 516                         return (error);
 517 
 518                 if (res.count != res.size) {
 519                         zcmn_err(getzoneid(), CE_WARN,
 520 "nfs3_directio_read: server %s returned incorrect amount",
 521                             sv_hostname);
 522                         return (EIO);
 523                 }
 524                 count -= res.count;
 525                 offset += res.count;
 526                 if (mi->mi_io_kstats) {
 527                         mutex_enter(&mi->mi_lock);
 528                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 529                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 530                         mutex_exit(&mi->mi_lock);
 531                 }
 532                 lwp_stat_update(LWP_STAT_INBLK, 1);
 533         } while (count && !res.eof);
 534 
 535         return (0);
 536 }
 537 
 538 /* ARGSUSED */
 539 static int
 540 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 541         caller_context_t *ct)
 542 {
 543         rnode_t *rp;
 544         u_offset_t off;
 545         offset_t diff;
 546         int on;
 547         size_t n;
 548         caddr_t base;
 549         uint_t flags;
 550         int error = 0;
 551         mntinfo_t *mi;
 552 
 553         rp = VTOR(vp);
 554         mi = VTOMI(vp);
 555 
 556         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 557 
 558         if (nfs_zone() != mi->mi_zone)
 559                 return (EIO);
 560 
 561         if (vp->v_type != VREG)
 562                 return (EISDIR);
 563 
 564         if (uiop->uio_resid == 0)
 565                 return (0);
 566 
 567         if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 568                 return (EINVAL);
 569 
 570         /*
 571          * Bypass VM if caching has been disabled (e.g., locking) or if
 572          * using client-side direct I/O and the file is not mmap'd and
 573          * there are no cached pages.
 574          */
 575         if ((vp->v_flag & VNOCACHE) ||
 576             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 577             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 578             !vn_has_cached_data(vp))) {
 579                 return (nfs3_directio_read(vp, uiop, cr));
 580         }
 581 
 582         do {
 583                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 584                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 585                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 586 
 587                 error = nfs3_validate_caches(vp, cr);
 588                 if (error)
 589                         break;
 590 
 591                 mutex_enter(&rp->r_statelock);
 592                 while (rp->r_flags & RINCACHEPURGE) {
 593                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 594                                 mutex_exit(&rp->r_statelock);
 595                                 return (EINTR);
 596                         }
 597                 }
 598                 diff = rp->r_size - uiop->uio_loffset;
 599                 mutex_exit(&rp->r_statelock);
 600                 if (diff <= 0)
 601                         break;
 602                 if (diff < n)
 603                         n = (size_t)diff;
 604 
 605                 if (vpm_enable) {
 606                         /*
 607                          * Copy data.
 608                          */
 609                         error = vpm_data_copy(vp, off + on, n, uiop,
 610                             1, NULL, 0, S_READ);
 611                 } else {
 612                         base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 613                             S_READ);
 614 
 615                         error = uiomove(base + on, n, UIO_READ, uiop);
 616                 }
 617 
 618                 if (!error) {
 619                         /*
 620                          * If read a whole block or read to eof,
 621                          * won't need this buffer again soon.
 622                          */
 623                         mutex_enter(&rp->r_statelock);
 624                         if (n + on == MAXBSIZE ||
 625                             uiop->uio_loffset == rp->r_size)
 626                                 flags = SM_DONTNEED;
 627                         else
 628                                 flags = 0;
 629                         mutex_exit(&rp->r_statelock);
 630                         if (vpm_enable) {
 631                                 error = vpm_sync_pages(vp, off, n, flags);
 632                         } else {
 633                                 error = segmap_release(segkmap, base, flags);
 634                         }
 635                 } else {
 636                         if (vpm_enable) {
 637                                 (void) vpm_sync_pages(vp, off, n, 0);
 638                         } else {
 639                                 (void) segmap_release(segkmap, base, 0);
 640                         }
 641                 }
 642         } while (!error && uiop->uio_resid > 0);
 643 
 644         return (error);
 645 }
 646 
 647 /* ARGSUSED */
 648 static int
 649 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 650         caller_context_t *ct)
 651 {
 652         rlim64_t limit = uiop->uio_llimit;
 653         rnode_t *rp;
 654         u_offset_t off;
 655         caddr_t base;
 656         uint_t flags;
 657         int remainder;
 658         size_t n;
 659         int on;
 660         int error;
 661         int resid;
 662         offset_t offset;
 663         mntinfo_t *mi;
 664         uint_t bsize;
 665 
 666         rp = VTOR(vp);
 667 
 668         if (vp->v_type != VREG)
 669                 return (EISDIR);
 670 
 671         mi = VTOMI(vp);
 672         if (nfs_zone() != mi->mi_zone)
 673                 return (EIO);
 674         if (uiop->uio_resid == 0)
 675                 return (0);
 676 
 677         if (ioflag & FAPPEND) {
 678                 struct vattr va;
 679 
 680                 /*
 681                  * Must serialize if appending.
 682                  */
 683                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 684                         nfs_rw_exit(&rp->r_rwlock);
 685                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 686                             INTR(vp)))
 687                                 return (EINTR);
 688                 }
 689 
 690                 va.va_mask = AT_SIZE;
 691                 error = nfs3getattr(vp, &va, cr);
 692                 if (error)
 693                         return (error);
 694                 uiop->uio_loffset = va.va_size;
 695         }
 696 
 697         offset = uiop->uio_loffset + uiop->uio_resid;
 698 
 699         if (uiop->uio_loffset < 0 || offset < 0)
 700                 return (EINVAL);
 701 
 702         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 703                 limit = MAXOFFSET_T;
 704 
 705         /*
 706          * Check to make sure that the process will not exceed
 707          * its limit on file size.  It is okay to write up to
 708          * the limit, but not beyond.  Thus, the write which
 709          * reaches the limit will be short and the next write
 710          * will return an error.
 711          */
 712         remainder = 0;
 713         if (offset > limit) {
 714                 remainder = offset - limit;
 715                 uiop->uio_resid = limit - uiop->uio_loffset;
 716                 if (uiop->uio_resid <= 0) {
 717                         proc_t *p = ttoproc(curthread);
 718 
 719                         uiop->uio_resid += remainder;
 720                         mutex_enter(&p->p_lock);
 721                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 722                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 723                         mutex_exit(&p->p_lock);
 724                         return (EFBIG);
 725                 }
 726         }
 727 
 728         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 729                 return (EINTR);
 730 
 731         /*
 732          * Bypass VM if caching has been disabled (e.g., locking) or if
 733          * using client-side direct I/O and the file is not mmap'd and
 734          * there are no cached pages.
 735          */
 736         if ((vp->v_flag & VNOCACHE) ||
 737             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 738             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 739             !vn_has_cached_data(vp))) {
 740                 size_t bufsize;
 741                 int count;
 742                 u_offset_t org_offset;
 743                 stable_how stab_comm;
 744 
 745 nfs3_fwrite:
 746                 if (rp->r_flags & RSTALE) {
 747                         resid = uiop->uio_resid;
 748                         offset = uiop->uio_loffset;
 749                         error = rp->r_error;
 750                         /*
 751                          * A close may have cleared r_error, if so,
 752                          * propagate ESTALE error return properly
 753                          */
 754                         if (error == 0)
 755                                 error = ESTALE;
 756                         goto bottom;
 757                 }
 758                 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 759                 base = kmem_alloc(bufsize, KM_SLEEP);
 760                 do {
 761                         if (ioflag & FDSYNC)
 762                                 stab_comm = DATA_SYNC;
 763                         else
 764                                 stab_comm = FILE_SYNC;
 765                         resid = uiop->uio_resid;
 766                         offset = uiop->uio_loffset;
 767                         count = MIN(uiop->uio_resid, bufsize);
 768                         org_offset = uiop->uio_loffset;
 769                         error = uiomove(base, count, UIO_WRITE, uiop);
 770                         if (!error) {
 771                                 error = nfs3write(vp, base, org_offset,
 772                                     count, cr, &stab_comm);
 773                         }
 774                 } while (!error && uiop->uio_resid > 0);
 775                 kmem_free(base, bufsize);
 776                 goto bottom;
 777         }
 778 
 779 
 780         bsize = vp->v_vfsp->vfs_bsize;
 781 
 782         do {
 783                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 784                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 785                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 786 
 787                 resid = uiop->uio_resid;
 788                 offset = uiop->uio_loffset;
 789 
 790                 if (rp->r_flags & RSTALE) {
 791                         error = rp->r_error;
 792                         /*
 793                          * A close may have cleared r_error, if so,
 794                          * propagate ESTALE error return properly
 795                          */
 796                         if (error == 0)
 797                                 error = ESTALE;
 798                         break;
 799                 }
 800 
 801                 /*
 802                  * Don't create dirty pages faster than they
 803                  * can be cleaned so that the system doesn't
 804                  * get imbalanced.  If the async queue is
 805                  * maxed out, then wait for it to drain before
 806                  * creating more dirty pages.  Also, wait for
 807                  * any threads doing pagewalks in the vop_getattr
 808                  * entry points so that they don't block for
 809                  * long periods.
 810                  */
 811                 mutex_enter(&rp->r_statelock);
 812                 while ((mi->mi_max_threads != 0 &&
 813                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 814                     rp->r_gcount > 0) {
 815                         if (INTR(vp)) {
 816                                 klwp_t *lwp = ttolwp(curthread);
 817 
 818                                 if (lwp != NULL)
 819                                         lwp->lwp_nostop++;
 820                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 821                                         mutex_exit(&rp->r_statelock);
 822                                         if (lwp != NULL)
 823                                                 lwp->lwp_nostop--;
 824                                         error = EINTR;
 825                                         goto bottom;
 826                                 }
 827                                 if (lwp != NULL)
 828                                         lwp->lwp_nostop--;
 829                         } else
 830                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 831                 }
 832                 mutex_exit(&rp->r_statelock);
 833 
 834                 /*
 835                  * Touch the page and fault it in if it is not in core
 836                  * before segmap_getmapflt or vpm_data_copy can lock it.
 837                  * This is to avoid the deadlock if the buffer is mapped
 838                  * to the same file through mmap which we want to write.
 839                  */
 840                 uio_prefaultpages((long)n, uiop);
 841 
 842                 if (vpm_enable) {
 843                         /*
 844                          * It will use kpm mappings, so no need to
 845                          * pass an address.
 846                          */
 847                         error = writerp(rp, NULL, n, uiop, 0);
 848                 } else  {
 849                         if (segmap_kpm) {
 850                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 851                                 size_t pn = MIN(PAGESIZE - pon,
 852                                     uiop->uio_resid);
 853                                 int pagecreate;
 854 
 855                                 mutex_enter(&rp->r_statelock);
 856                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 857                                     uiop->uio_loffset + pn >= rp->r_size);
 858                                 mutex_exit(&rp->r_statelock);
 859 
 860                                 base = segmap_getmapflt(segkmap, vp, off + on,
 861                                     pn, !pagecreate, S_WRITE);
 862 
 863                                 error = writerp(rp, base + pon, n, uiop,
 864                                     pagecreate);
 865 
 866                         } else {
 867                                 base = segmap_getmapflt(segkmap, vp, off + on,
 868                                     n, 0, S_READ);
 869                                 error = writerp(rp, base + on, n, uiop, 0);
 870                         }
 871                 }
 872 
 873                 if (!error) {
 874                         if (mi->mi_flags & MI_NOAC)
 875                                 flags = SM_WRITE;
 876                         else if ((uiop->uio_loffset % bsize) == 0 ||
 877                             IS_SWAPVP(vp)) {
 878                                 /*
 879                                  * Have written a whole block.
 880                                  * Start an asynchronous write
 881                                  * and mark the buffer to
 882                                  * indicate that it won't be
 883                                  * needed again soon.
 884                                  */
 885                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 886                         } else
 887                                 flags = 0;
 888                         if ((ioflag & (FSYNC|FDSYNC)) ||
 889                             (rp->r_flags & ROUTOFSPACE)) {
 890                                 flags &= ~SM_ASYNC;
 891                                 flags |= SM_WRITE;
 892                         }
 893                         if (vpm_enable) {
 894                                 error = vpm_sync_pages(vp, off, n, flags);
 895                         } else {
 896                                 error = segmap_release(segkmap, base, flags);
 897                         }
 898                 } else {
 899                         if (vpm_enable) {
 900                                 (void) vpm_sync_pages(vp, off, n, 0);
 901                         } else {
 902                                 (void) segmap_release(segkmap, base, 0);
 903                         }
 904                         /*
 905                          * In the event that we got an access error while
 906                          * faulting in a page for a write-only file just
 907                          * force a write.
 908                          */
 909                         if (error == EACCES)
 910                                 goto nfs3_fwrite;
 911                 }
 912         } while (!error && uiop->uio_resid > 0);
 913 
 914 bottom:
 915         if (error) {
 916                 uiop->uio_resid = resid + remainder;
 917                 uiop->uio_loffset = offset;
 918         } else
 919                 uiop->uio_resid += remainder;
 920 
 921         nfs_rw_exit(&rp->r_lkserlock);
 922 
 923         return (error);
 924 }
 925 
 926 /*
 927  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 928  */
 929 static int
 930 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 931         int flags, cred_t *cr)
 932 {
 933         struct buf *bp;
 934         int error;
 935         page_t *savepp;
 936         uchar_t fsdata;
 937         stable_how stab_comm;
 938 
 939         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 940         bp = pageio_setup(pp, len, vp, flags);
 941         ASSERT(bp != NULL);
 942 
 943         /*
 944          * pageio_setup should have set b_addr to 0.  This
 945          * is correct since we want to do I/O on a page
 946          * boundary.  bp_mapin will use this addr to calculate
 947          * an offset, and then set b_addr to the kernel virtual
 948          * address it allocated for us.
 949          */
 950         ASSERT(bp->b_un.b_addr == 0);
 951 
 952         bp->b_edev = 0;
 953         bp->b_dev = 0;
 954         bp->b_lblkno = lbtodb(off);
 955         bp->b_file = vp;
 956         bp->b_offset = (offset_t)off;
 957         bp_mapin(bp);
 958 
 959         /*
 960          * Calculate the desired level of stability to write data
 961          * on the server and then mark all of the pages to reflect
 962          * this.
 963          */
 964         if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 965             freemem > desfree) {
 966                 stab_comm = UNSTABLE;
 967                 fsdata = C_DELAYCOMMIT;
 968         } else {
 969                 stab_comm = FILE_SYNC;
 970                 fsdata = C_NOCOMMIT;
 971         }
 972 
 973         savepp = pp;
 974         do {
 975                 pp->p_fsdata = fsdata;
 976         } while ((pp = pp->p_next) != savepp);
 977 
 978         error = nfs3_bio(bp, &stab_comm, cr);
 979 
 980         bp_mapout(bp);
 981         pageio_done(bp);
 982 
 983         /*
 984          * If the server wrote pages in a more stable fashion than
 985          * was requested, then clear all of the marks in the pages
 986          * indicating that COMMIT operations were required.
 987          */
 988         if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 989                 do {
 990                         pp->p_fsdata = C_NOCOMMIT;
 991                 } while ((pp = pp->p_next) != savepp);
 992         }
 993 
 994         return (error);
 995 }
 996 
 997 /*
 998  * Write to file.  Writes to remote server in largest size
 999  * chunks that the server can handle.  Write is synchronous.
1000  */
1001 static int
1002 nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
1003         stable_how *stab_comm)
1004 {
1005         mntinfo_t *mi;
1006         WRITE3args args;
1007         WRITE3res res;
1008         int error;
1009         int tsize;
1010         rnode_t *rp;
1011         int douprintf;
1012 
1013         rp = VTOR(vp);
1014         mi = VTOMI(vp);
1015 
1016         ASSERT(nfs_zone() == mi->mi_zone);
1017 
1018         args.file = *VTOFH3(vp);
1019         args.stable = *stab_comm;
1020 
1021         *stab_comm = FILE_SYNC;
1022 
1023         douprintf = 1;
1024 
1025         do {
1026                 if ((vp->v_flag & VNOCACHE) ||
1027                     (rp->r_flags & RDIRECTIO) ||
1028                     (mi->mi_flags & MI_DIRECTIO))
1029                         tsize = MIN(mi->mi_stsize, count);
1030                 else
1031                         tsize = MIN(mi->mi_curwrite, count);
1032                 args.offset = (offset3)offset;
1033                 args.count = (count3)tsize;
1034                 args.data.data_len = (uint_t)tsize;
1035                 args.data.data_val = base;
1036 
1037                 if (mi->mi_io_kstats) {
1038                         mutex_enter(&mi->mi_lock);
1039                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1040                         mutex_exit(&mi->mi_lock);
1041                 }
1042                 args.mblk = NULL;
1043                 do {
1044                         error = rfs3call(mi, NFSPROC3_WRITE,
1045                             xdr_WRITE3args, (caddr_t)&args,
1046                             xdr_WRITE3res, (caddr_t)&res, cr,
1047                             &douprintf, &res.status, 0, NULL);
1048                 } while (error == ENFS_TRYAGAIN);
1049                 if (mi->mi_io_kstats) {
1050                         mutex_enter(&mi->mi_lock);
1051                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1052                         mutex_exit(&mi->mi_lock);
1053                 }
1054 
1055                 if (error)
1056                         return (error);
1057                 error = geterrno3(res.status);
1058                 if (!error) {
1059                         if (res.resok.count > args.count) {
1060                                 zcmn_err(getzoneid(), CE_WARN,
1061                                     "nfs3write: server %s wrote %u, "
1062                                     "requested was %u",
1063                                     rp->r_server->sv_hostname,
1064                                     res.resok.count, args.count);
1065                                 return (EIO);
1066                         }
1067                         if (res.resok.committed == UNSTABLE) {
1068                                 *stab_comm = UNSTABLE;
1069                                 if (args.stable == DATA_SYNC ||
1070                                     args.stable == FILE_SYNC) {
1071                                         zcmn_err(getzoneid(), CE_WARN,
1072                         "nfs3write: server %s did not commit to stable storage",
1073                                             rp->r_server->sv_hostname);
1074                                         return (EIO);
1075                                 }
1076                         }
1077                         tsize = (int)res.resok.count;
1078                         count -= tsize;
1079                         base += tsize;
1080                         offset += tsize;
1081                         if (mi->mi_io_kstats) {
1082                                 mutex_enter(&mi->mi_lock);
1083                                 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1084                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1085                                     tsize;
1086                                 mutex_exit(&mi->mi_lock);
1087                         }
1088                         lwp_stat_update(LWP_STAT_OUBLK, 1);
1089                         mutex_enter(&rp->r_statelock);
1090                         if (rp->r_flags & RHAVEVERF) {
1091                                 if (rp->r_verf != res.resok.verf) {
1092                                         nfs3_set_mod(vp);
1093                                         rp->r_verf = res.resok.verf;
1094                                         /*
1095                                          * If the data was written UNSTABLE,
1096                                          * then might as well stop because
1097                                          * the whole block will have to get
1098                                          * rewritten anyway.
1099                                          */
1100                                         if (*stab_comm == UNSTABLE) {
1101                                                 mutex_exit(&rp->r_statelock);
1102                                                 break;
1103                                         }
1104                                 }
1105                         } else {
1106                                 rp->r_verf = res.resok.verf;
1107                                 rp->r_flags |= RHAVEVERF;
1108                         }
1109                         /*
1110                          * Mark the attribute cache as timed out and
1111                          * set RWRITEATTR to indicate that the file
1112                          * was modified with a WRITE operation and
1113                          * that the attributes can not be trusted.
1114                          */
1115                         PURGE_ATTRCACHE_LOCKED(rp);
1116                         rp->r_flags |= RWRITEATTR;
1117                         mutex_exit(&rp->r_statelock);
1118                 }
1119         } while (!error && count);
1120 
1121         return (error);
1122 }
1123 
1124 /*
1125  * Read from a file.  Reads data in largest chunks our interface can handle.
1126  */
1127 static int
1128 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1129         size_t *residp, cred_t *cr)
1130 {
1131         mntinfo_t *mi;
1132         READ3args args;
1133         READ3vres res;
1134         int tsize;
1135         int error;
1136         int douprintf;
1137         failinfo_t fi;
1138         rnode_t *rp;
1139         struct vattr va;
1140         hrtime_t t;
1141 
1142         rp = VTOR(vp);
1143         mi = VTOMI(vp);
1144         ASSERT(nfs_zone() == mi->mi_zone);
1145         douprintf = 1;
1146 
1147         args.file = *VTOFH3(vp);
1148         fi.vp = vp;
1149         fi.fhp = (caddr_t)&args.file;
1150         fi.copyproc = nfs3copyfh;
1151         fi.lookupproc = nfs3lookup;
1152         fi.xattrdirproc = acl_getxattrdir3;
1153 
1154         res.pov.fres.vp = vp;
1155         res.pov.fres.vap = &va;
1156 
1157         res.wlist = NULL;
1158         *residp = count;
1159         do {
1160                 if (mi->mi_io_kstats) {
1161                         mutex_enter(&mi->mi_lock);
1162                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1163                         mutex_exit(&mi->mi_lock);
1164                 }
1165 
1166                 do {
1167                         if ((vp->v_flag & VNOCACHE) ||
1168                             (rp->r_flags & RDIRECTIO) ||
1169                             (mi->mi_flags & MI_DIRECTIO))
1170                                 tsize = MIN(mi->mi_tsize, count);
1171                         else
1172                                 tsize = MIN(mi->mi_curread, count);
1173                         res.data.data_val = base;
1174                         res.data.data_len = tsize;
1175                         args.offset = (offset3)offset;
1176                         args.count = (count3)tsize;
1177                         args.res_uiop = NULL;
1178                         args.res_data_val_alt = base;
1179 
1180                         t = gethrtime();
1181                         error = rfs3call(mi, NFSPROC3_READ,
1182                             xdr_READ3args, (caddr_t)&args,
1183                             xdr_READ3vres, (caddr_t)&res, cr,
1184                             &douprintf, &res.status, 0, &fi);
1185                 } while (error == ENFS_TRYAGAIN);
1186 
1187                 if (mi->mi_io_kstats) {
1188                         mutex_enter(&mi->mi_lock);
1189                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1190                         mutex_exit(&mi->mi_lock);
1191                 }
1192 
1193                 if (error)
1194                         return (error);
1195 
1196                 error = geterrno3(res.status);
1197                 if (error)
1198                         return (error);
1199 
1200                 if (res.count != res.data.data_len) {
1201                         zcmn_err(getzoneid(), CE_WARN,
1202                             "nfs3read: server %s returned incorrect amount",
1203                             rp->r_server->sv_hostname);
1204                         return (EIO);
1205                 }
1206 
1207                 count -= res.count;
1208                 *residp = count;
1209                 base += res.count;
1210                 offset += res.count;
1211                 if (mi->mi_io_kstats) {
1212                         mutex_enter(&mi->mi_lock);
1213                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1214                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1215                         mutex_exit(&mi->mi_lock);
1216                 }
1217                 lwp_stat_update(LWP_STAT_INBLK, 1);
1218         } while (count && !res.eof);
1219 
1220         if (res.pov.attributes) {
1221                 mutex_enter(&rp->r_statelock);
1222                 if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1223                         mutex_exit(&rp->r_statelock);
1224                         PURGE_ATTRCACHE(vp);
1225                 } else {
1226                         if (rp->r_mtime <= t)
1227                                 nfs_attrcache_va(vp, &va);
1228                         mutex_exit(&rp->r_statelock);
1229                 }
1230         }
1231 
1232         return (0);
1233 }
1234 
1235 /* ARGSUSED */
1236 static int
1237 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1238         caller_context_t *ct)
1239 {
1240 
1241         if (nfs_zone() != VTOMI(vp)->mi_zone)
1242                 return (EIO);
1243         switch (cmd) {
1244                 case _FIODIRECTIO:
1245                         return (nfs_directio(vp, (int)arg, cr));
1246                 default:
1247                         return (ENOTTY);
1248         }
1249 }
1250 
1251 /* ARGSUSED */
1252 static int
1253 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1254         caller_context_t *ct)
1255 {
1256         int error;
1257         rnode_t *rp;
1258 
1259         if (nfs_zone() != VTOMI(vp)->mi_zone)
1260                 return (EIO);
1261         /*
1262          * If it has been specified that the return value will
1263          * just be used as a hint, and we are only being asked
1264          * for size, fsid or rdevid, then return the client's
1265          * notion of these values without checking to make sure
1266          * that the attribute cache is up to date.
1267          * The whole point is to avoid an over the wire GETATTR
1268          * call.
1269          */
1270         rp = VTOR(vp);
1271         if (flags & ATTR_HINT) {
1272                 if (vap->va_mask ==
1273                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1274                         mutex_enter(&rp->r_statelock);
1275                         if (vap->va_mask | AT_SIZE)
1276                                 vap->va_size = rp->r_size;
1277                         if (vap->va_mask | AT_FSID)
1278                                 vap->va_fsid = rp->r_attr.va_fsid;
1279                         if (vap->va_mask | AT_RDEV)
1280                                 vap->va_rdev = rp->r_attr.va_rdev;
1281                         mutex_exit(&rp->r_statelock);
1282                         return (0);
1283                 }
1284         }
1285 
1286         /*
1287          * Only need to flush pages if asking for the mtime
1288          * and if there any dirty pages or any outstanding
1289          * asynchronous (write) requests for this file.
1290          */
1291         if (vap->va_mask & AT_MTIME) {
1292                 if (vn_has_cached_data(vp) &&
1293                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1294                         mutex_enter(&rp->r_statelock);
1295                         rp->r_gcount++;
1296                         mutex_exit(&rp->r_statelock);
1297                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1298                         mutex_enter(&rp->r_statelock);
1299                         if (error && (error == ENOSPC || error == EDQUOT)) {
1300                                 if (!rp->r_error)
1301                                         rp->r_error = error;
1302                         }
1303                         if (--rp->r_gcount == 0)
1304                                 cv_broadcast(&rp->r_cv);
1305                         mutex_exit(&rp->r_statelock);
1306                 }
1307         }
1308 
1309         return (nfs3getattr(vp, vap, cr));
1310 }
1311 
1312 /*ARGSUSED4*/
1313 static int
1314 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1315                 caller_context_t *ct)
1316 {
1317         int error;
1318         struct vattr va;
1319 
1320         if (vap->va_mask & AT_NOSET)
1321                 return (EINVAL);
1322         if (nfs_zone() != VTOMI(vp)->mi_zone)
1323                 return (EIO);
1324 
1325         va.va_mask = AT_UID | AT_MODE;
1326         error = nfs3getattr(vp, &va, cr);
1327         if (error)
1328                 return (error);
1329 
1330         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1331             vp);
1332         if (error)
1333                 return (error);
1334 
1335         error = nfs3setattr(vp, vap, flags, cr);
1336 
1337         if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
1338                 vnevent_truncate(vp, ct);
1339 
1340         return (error);
1341 }
1342 
1343 static int
1344 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1345 {
1346         int error;
1347         uint_t mask;
1348         SETATTR3args args;
1349         SETATTR3res res;
1350         int douprintf;
1351         rnode_t *rp;
1352         struct vattr va;
1353         mode_t omode;
1354         vsecattr_t *vsp;
1355         hrtime_t t;
1356 
1357         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1358         mask = vap->va_mask;
1359 
1360         rp = VTOR(vp);
1361 
1362         /*
1363          * Only need to flush pages if there are any pages and
1364          * if the file is marked as dirty in some fashion.  The
1365          * file must be flushed so that we can accurately
1366          * determine the size of the file and the cached data
1367          * after the SETATTR returns.  A file is considered to
1368          * be dirty if it is either marked with RDIRTY, has
1369          * outstanding i/o's active, or is mmap'd.  In this
1370          * last case, we can't tell whether there are dirty
1371          * pages, so we flush just to be sure.
1372          */
1373         if (vn_has_cached_data(vp) &&
1374             ((rp->r_flags & RDIRTY) ||
1375             rp->r_count > 0 ||
1376             rp->r_mapcnt > 0)) {
1377                 ASSERT(vp->v_type != VCHR);
1378                 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1379                 if (error && (error == ENOSPC || error == EDQUOT)) {
1380                         mutex_enter(&rp->r_statelock);
1381                         if (!rp->r_error)
1382                                 rp->r_error = error;
1383                         mutex_exit(&rp->r_statelock);
1384                 }
1385         }
1386 
1387         args.object = *RTOFH3(rp);
1388         /*
1389          * If the intent is for the server to set the times,
1390          * there is no point in have the mask indicating set mtime or
1391          * atime, because the vap values may be junk, and so result
1392          * in an overflow error. Remove these flags from the vap mask
1393          * before calling in this case, and restore them afterwards.
1394          */
1395         if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1396                 /* Use server times, so don't set the args time fields */
1397                 vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1398                 error = vattr_to_sattr3(vap, &args.new_attributes);
1399                 vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1400                 if (mask & AT_ATIME) {
1401                         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1402                 }
1403                 if (mask & AT_MTIME) {
1404                         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1405                 }
1406         } else {
1407                 /* Either do not set times or use the client specified times */
1408                 error = vattr_to_sattr3(vap, &args.new_attributes);
1409         }
1410 
1411         if (error) {
1412                 /* req time field(s) overflow - return immediately */
1413                 return (error);
1414         }
1415 
1416         va.va_mask = AT_MODE | AT_CTIME;
1417         error = nfs3getattr(vp, &va, cr);
1418         if (error)
1419                 return (error);
1420         omode = va.va_mode;
1421 
1422 tryagain:
1423         if (mask & AT_SIZE) {
1424                 args.guard.check = TRUE;
1425                 args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1426                 args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1427         } else
1428                 args.guard.check = FALSE;
1429 
1430         douprintf = 1;
1431 
1432         t = gethrtime();
1433 
1434         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1435             xdr_SETATTR3args, (caddr_t)&args,
1436             xdr_SETATTR3res, (caddr_t)&res, cr,
1437             &douprintf, &res.status, 0, NULL);
1438 
1439         /*
1440          * Purge the access cache and ACL cache if changing either the
1441          * owner of the file, the group owner, or the mode.  These may
1442          * change the access permissions of the file, so purge old
1443          * information and start over again.
1444          */
1445         if (mask & (AT_UID | AT_GID | AT_MODE)) {
1446                 (void) nfs_access_purge_rp(rp);
1447                 if (rp->r_secattr != NULL) {
1448                         mutex_enter(&rp->r_statelock);
1449                         vsp = rp->r_secattr;
1450                         rp->r_secattr = NULL;
1451                         mutex_exit(&rp->r_statelock);
1452                         if (vsp != NULL)
1453                                 nfs_acl_free(vsp);
1454                 }
1455         }
1456 
1457         if (error) {
1458                 PURGE_ATTRCACHE(vp);
1459                 return (error);
1460         }
1461 
1462         error = geterrno3(res.status);
1463         if (!error) {
1464                 /*
1465                  * If changing the size of the file, invalidate
1466                  * any local cached data which is no longer part
1467                  * of the file.  We also possibly invalidate the
1468                  * last page in the file.  We could use
1469                  * pvn_vpzero(), but this would mark the page as
1470                  * modified and require it to be written back to
1471                  * the server for no particularly good reason.
1472                  * This way, if we access it, then we bring it
1473                  * back in.  A read should be cheaper than a
1474                  * write.
1475                  */
1476                 if (mask & AT_SIZE) {
1477                         nfs_invalidate_pages(vp,
1478                             (vap->va_size & PAGEMASK), cr);
1479                 }
1480                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1481                 /*
1482                  * Some servers will change the mode to clear the setuid
1483                  * and setgid bits when changing the uid or gid.  The
1484                  * client needs to compensate appropriately.
1485                  */
1486                 if (mask & (AT_UID | AT_GID)) {
1487                         int terror;
1488 
1489                         va.va_mask = AT_MODE;
1490                         terror = nfs3getattr(vp, &va, cr);
1491                         if (!terror &&
1492                             (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1493                             (!(mask & AT_MODE) && va.va_mode != omode))) {
1494                                 va.va_mask = AT_MODE;
1495                                 if (mask & AT_MODE)
1496                                         va.va_mode = vap->va_mode;
1497                                 else
1498                                         va.va_mode = omode;
1499                                 (void) nfs3setattr(vp, &va, 0, cr);
1500                         }
1501                 }
1502         } else {
1503                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1504                 /*
1505                  * If we got back a "not synchronized" error, then
1506                  * we need to retry with a new guard value.  The
1507                  * guard value used is the change time.  If the
1508                  * server returned post_op_attr, then we can just
1509                  * retry because we have the latest attributes.
1510                  * Otherwise, we issue a GETATTR to get the latest
1511                  * attributes and then retry.  If we couldn't get
1512                  * the attributes this way either, then we give
1513                  * up because we can't complete the operation as
1514                  * required.
1515                  */
1516                 if (res.status == NFS3ERR_NOT_SYNC) {
1517                         va.va_mask = AT_CTIME;
1518                         if (nfs3getattr(vp, &va, cr) == 0)
1519                                 goto tryagain;
1520                 }
1521                 PURGE_STALE_FH(error, vp, cr);
1522         }
1523 
1524         return (error);
1525 }
1526 
1527 static int
1528 nfs3_accessx(void *vp, int mode, cred_t *cr)
1529 {
1530         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1531         return (nfs3_access(vp, mode, 0, cr, NULL));
1532 }
1533 
1534 /* ARGSUSED */
1535 static int
1536 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1537 {
1538         int error;
1539         ACCESS3args args;
1540         ACCESS3res res;
1541         int douprintf;
1542         uint32 acc;
1543         rnode_t *rp;
1544         cred_t *cred, *ncr, *ncrfree = NULL;
1545         failinfo_t fi;
1546         nfs_access_type_t cacc;
1547         hrtime_t t;
1548 
1549         acc = 0;
1550         if (nfs_zone() != VTOMI(vp)->mi_zone)
1551                 return (EIO);
1552         if (mode & VREAD)
1553                 acc |= ACCESS3_READ;
1554         if (mode & VWRITE) {
1555                 if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1556                         return (EROFS);
1557                 if (vp->v_type == VDIR)
1558                         acc |= ACCESS3_DELETE;
1559                 acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1560         }
1561         if (mode & VEXEC) {
1562                 if (vp->v_type == VDIR)
1563                         acc |= ACCESS3_LOOKUP;
1564                 else
1565                         acc |= ACCESS3_EXECUTE;
1566         }
1567 
1568         rp = VTOR(vp);
1569         args.object = *VTOFH3(vp);
1570         if (vp->v_type == VDIR) {
1571                 args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1572                     ACCESS3_EXTEND | ACCESS3_LOOKUP;
1573         } else {
1574                 args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1575                     ACCESS3_EXECUTE;
1576         }
1577         fi.vp = vp;
1578         fi.fhp = (caddr_t)&args.object;
1579         fi.copyproc = nfs3copyfh;
1580         fi.lookupproc = nfs3lookup;
1581         fi.xattrdirproc = acl_getxattrdir3;
1582 
1583         cred = cr;
1584         /*
1585          * ncr and ncrfree both initially
1586          * point to the memory area returned
1587          * by crnetadjust();
1588          * ncrfree not NULL when exiting means
1589          * that we need to release it
1590          */
1591         ncr = crnetadjust(cred);
1592         ncrfree = ncr;
1593 tryagain:
1594         if (rp->r_acache != NULL) {
1595                 cacc = nfs_access_check(rp, acc, cred);
1596                 if (cacc == NFS_ACCESS_ALLOWED) {
1597                         if (ncrfree != NULL)
1598                                 crfree(ncrfree);
1599                         return (0);
1600                 }
1601                 if (cacc == NFS_ACCESS_DENIED) {
1602                         /*
1603                          * If the cred can be adjusted, try again
1604                          * with the new cred.
1605                          */
1606                         if (ncr != NULL) {
1607                                 cred = ncr;
1608                                 ncr = NULL;
1609                                 goto tryagain;
1610                         }
1611                         if (ncrfree != NULL)
1612                                 crfree(ncrfree);
1613                         return (EACCES);
1614                 }
1615         }
1616 
1617         douprintf = 1;
1618 
1619         t = gethrtime();
1620 
1621         error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1622             xdr_ACCESS3args, (caddr_t)&args,
1623             xdr_ACCESS3res, (caddr_t)&res, cred,
1624             &douprintf, &res.status, 0, &fi);
1625 
1626         if (error) {
1627                 if (ncrfree != NULL)
1628                         crfree(ncrfree);
1629                 return (error);
1630         }
1631 
1632         error = geterrno3(res.status);
1633         if (!error) {
1634                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1635                 nfs_access_cache(rp, args.access, res.resok.access, cred);
1636                 /*
1637                  * we just cached results with cred; if cred is the
1638                  * adjusted credentials from crnetadjust, we do not want
1639                  * to release them before exiting: hence setting ncrfree
1640                  * to NULL
1641                  */
1642                 if (cred != cr)
1643                         ncrfree = NULL;
1644                 if ((acc & res.resok.access) != acc) {
1645                         /*
1646                          * If the cred can be adjusted, try again
1647                          * with the new cred.
1648                          */
1649                         if (ncr != NULL) {
1650                                 cred = ncr;
1651                                 ncr = NULL;
1652                                 goto tryagain;
1653                         }
1654                         error = EACCES;
1655                 }
1656         } else {
1657                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1658                 PURGE_STALE_FH(error, vp, cr);
1659         }
1660 
1661         if (ncrfree != NULL)
1662                 crfree(ncrfree);
1663 
1664         return (error);
1665 }
1666 
1667 static int nfs3_do_symlink_cache = 1;
1668 
1669 /* ARGSUSED */
1670 static int
1671 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1672 {
1673         int error;
1674         READLINK3args args;
1675         READLINK3res res;
1676         nfspath3 resdata_backup;
1677         rnode_t *rp;
1678         int douprintf;
1679         int len;
1680         failinfo_t fi;
1681         hrtime_t t;
1682 
1683         /*
1684          * Can't readlink anything other than a symbolic link.
1685          */
1686         if (vp->v_type != VLNK)
1687                 return (EINVAL);
1688         if (nfs_zone() != VTOMI(vp)->mi_zone)
1689                 return (EIO);
1690 
1691         rp = VTOR(vp);
1692         if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1693                 error = nfs3_validate_caches(vp, cr);
1694                 if (error)
1695                         return (error);
1696                 mutex_enter(&rp->r_statelock);
1697                 if (rp->r_symlink.contents != NULL) {
1698                         error = uiomove(rp->r_symlink.contents,
1699                             rp->r_symlink.len, UIO_READ, uiop);
1700                         mutex_exit(&rp->r_statelock);
1701                         return (error);
1702                 }
1703                 mutex_exit(&rp->r_statelock);
1704         }
1705 
1706         args.symlink = *VTOFH3(vp);
1707         fi.vp = vp;
1708         fi.fhp = (caddr_t)&args.symlink;
1709         fi.copyproc = nfs3copyfh;
1710         fi.lookupproc = nfs3lookup;
1711         fi.xattrdirproc = acl_getxattrdir3;
1712 
1713         res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1714 
1715         resdata_backup = res.resok.data;
1716 
1717         douprintf = 1;
1718 
1719         t = gethrtime();
1720 
1721         error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1722             xdr_READLINK3args, (caddr_t)&args,
1723             xdr_READLINK3res, (caddr_t)&res, cr,
1724             &douprintf, &res.status, 0, &fi);
1725 
1726         if (res.resok.data == nfs3nametoolong)
1727                 error = EINVAL;
1728 
1729         if (error) {
1730                 kmem_free(resdata_backup, MAXPATHLEN);
1731                 return (error);
1732         }
1733 
1734         error = geterrno3(res.status);
1735         if (!error) {
1736                 nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1737                     cr);
1738                 len = strlen(res.resok.data);
1739                 error = uiomove(res.resok.data, len, UIO_READ, uiop);
1740                 if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1741                         mutex_enter(&rp->r_statelock);
1742                                 if (rp->r_symlink.contents == NULL) {
1743                                 rp->r_symlink.contents = res.resok.data;
1744                                 rp->r_symlink.len = len;
1745                                 rp->r_symlink.size = MAXPATHLEN;
1746                                 mutex_exit(&rp->r_statelock);
1747                         } else {
1748                                 mutex_exit(&rp->r_statelock);
1749 
1750                                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1751                         }
1752                 } else {
1753                         kmem_free((void *)res.resok.data, MAXPATHLEN);
1754                 }
1755         } else {
1756                 nfs3_cache_post_op_attr(vp,
1757                     &res.resfail.symlink_attributes, t, cr);
1758                 PURGE_STALE_FH(error, vp, cr);
1759 
1760                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1761 
1762         }
1763 
1764         /*
1765          * The over the wire error for attempting to readlink something
1766          * other than a symbolic link is ENXIO.  However, we need to
1767          * return EINVAL instead of ENXIO, so we map it here.
1768          */
1769         return (error == ENXIO ? EINVAL : error);
1770 }
1771 
1772 /*
1773  * Flush local dirty pages to stable storage on the server.
1774  *
1775  * If FNODSYNC is specified, then there is nothing to do because
1776  * metadata changes are not cached on the client before being
1777  * sent to the server.
1778  */
1779 /* ARGSUSED */
1780 static int
1781 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1782 {
1783         int error;
1784 
1785         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1786                 return (0);
1787         if (nfs_zone() != VTOMI(vp)->mi_zone)
1788                 return (EIO);
1789 
1790         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1791         if (!error)
1792                 error = VTOR(vp)->r_error;
1793         return (error);
1794 }
1795 
1796 /*
1797  * Weirdness: if the file was removed or the target of a rename
1798  * operation while it was open, it got renamed instead.  Here we
1799  * remove the renamed file.
1800  */
1801 /* ARGSUSED */
1802 static void
1803 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1804 {
1805         rnode_t *rp;
1806 
1807         ASSERT(vp != DNLC_NO_VNODE);
1808 
1809         /*
1810          * If this is coming from the wrong zone, we let someone in the right
1811          * zone take care of it asynchronously.  We can get here due to
1812          * VN_RELE() being called from pageout() or fsflush().  This call may
1813          * potentially turn into an expensive no-op if, for instance, v_count
1814          * gets incremented in the meantime, but it's still correct.
1815          */
1816         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1817                 nfs_async_inactive(vp, cr, nfs3_inactive);
1818                 return;
1819         }
1820 
1821         rp = VTOR(vp);
1822 redo:
1823         if (rp->r_unldvp != NULL) {
1824                 /*
1825                  * Save the vnode pointer for the directory where the
1826                  * unlinked-open file got renamed, then set it to NULL
1827                  * to prevent another thread from getting here before
1828                  * we're done with the remove.  While we have the
1829                  * statelock, make local copies of the pertinent rnode
1830                  * fields.  If we weren't to do this in an atomic way, the
1831                  * the unl* fields could become inconsistent with respect
1832                  * to each other due to a race condition between this
1833                  * code and nfs_remove().  See bug report 1034328.
1834                  */
1835                 mutex_enter(&rp->r_statelock);
1836                 if (rp->r_unldvp != NULL) {
1837                         vnode_t *unldvp;
1838                         char *unlname;
1839                         cred_t *unlcred;
1840                         REMOVE3args args;
1841                         REMOVE3res res;
1842                         int douprintf;
1843                         int error;
1844                         hrtime_t t;
1845 
1846                         unldvp = rp->r_unldvp;
1847                         rp->r_unldvp = NULL;
1848                         unlname = rp->r_unlname;
1849                         rp->r_unlname = NULL;
1850                         unlcred = rp->r_unlcred;
1851                         rp->r_unlcred = NULL;
1852                         mutex_exit(&rp->r_statelock);
1853 
1854                         /*
1855                          * If there are any dirty pages left, then flush
1856                          * them.  This is unfortunate because they just
1857                          * may get thrown away during the remove operation,
1858                          * but we have to do this for correctness.
1859                          */
1860                         if (vn_has_cached_data(vp) &&
1861                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1862                                 ASSERT(vp->v_type != VCHR);
1863                                 error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1864                                     cr, ct);
1865                                 if (error) {
1866                                         mutex_enter(&rp->r_statelock);
1867                                         if (!rp->r_error)
1868                                                 rp->r_error = error;
1869                                         mutex_exit(&rp->r_statelock);
1870                                 }
1871                         }
1872 
1873                         /*
1874                          * Do the remove operation on the renamed file
1875                          */
1876                         setdiropargs3(&args.object, unlname, unldvp);
1877 
1878                         douprintf = 1;
1879 
1880                         t = gethrtime();
1881 
1882                         error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1883                             xdr_diropargs3, (caddr_t)&args,
1884                             xdr_REMOVE3res, (caddr_t)&res, unlcred,
1885                             &douprintf, &res.status, 0, NULL);
1886 
1887                         if (error) {
1888                                 PURGE_ATTRCACHE(unldvp);
1889                         } else {
1890                                 error = geterrno3(res.status);
1891                                 if (!error) {
1892                                         nfs3_cache_wcc_data(unldvp,
1893                                             &res.resok.dir_wcc, t, cr);
1894                                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1895                                                 nfs_purge_rddir_cache(unldvp);
1896                                 } else {
1897                                         nfs3_cache_wcc_data(unldvp,
1898                                             &res.resfail.dir_wcc, t, cr);
1899                                         PURGE_STALE_FH(error, unldvp, cr);
1900                                 }
1901                         }
1902 
1903                         /*
1904                          * Release stuff held for the remove
1905                          */
1906                         VN_RELE(unldvp);
1907                         kmem_free(unlname, MAXNAMELEN);
1908                         crfree(unlcred);
1909                         goto redo;
1910                 }
1911                 mutex_exit(&rp->r_statelock);
1912         }
1913 
1914         rp_addfree(rp, cr);
1915 }
1916 
1917 /*
1918  * Remote file system operations having to do with directory manipulation.
1919  */
1920 
1921 /* ARGSUSED */
1922 static int
1923 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1924         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1925         int *direntflags, pathname_t *realpnp)
1926 {
1927         int error;
1928         vnode_t *vp;
1929         vnode_t *avp = NULL;
1930         rnode_t *drp;
1931 
1932         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1933                 return (EPERM);
1934 
1935         drp = VTOR(dvp);
1936 
1937         /*
1938          * Are we looking up extended attributes?  If so, "dvp" is
1939          * the file or directory for which we want attributes, and
1940          * we need a lookup of the hidden attribute directory
1941          * before we lookup the rest of the path.
1942          */
1943         if (flags & LOOKUP_XATTR) {
1944                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1945                 mntinfo_t *mi;
1946 
1947                 mi = VTOMI(dvp);
1948                 if (!(mi->mi_flags & MI_EXTATTR))
1949                         return (EINVAL);
1950 
1951                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1952                         return (EINTR);
1953 
1954                 (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1955                 if (avp == NULL)
1956                         error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1957                 else
1958                         error = 0;
1959 
1960                 nfs_rw_exit(&drp->r_rwlock);
1961 
1962                 if (error) {
1963                         if (mi->mi_flags & MI_EXTATTR)
1964                                 return (error);
1965                         return (EINVAL);
1966                 }
1967                 dvp = avp;
1968                 drp = VTOR(dvp);
1969         }
1970 
1971         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1972                 error = EINTR;
1973                 goto out;
1974         }
1975 
1976         error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1977 
1978         nfs_rw_exit(&drp->r_rwlock);
1979 
1980         /*
1981          * If vnode is a device, create special vnode.
1982          */
1983         if (!error && IS_DEVVP(*vpp)) {
1984                 vp = *vpp;
1985                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1986                 VN_RELE(vp);
1987         }
1988 
1989 out:
1990         if (avp != NULL)
1991                 VN_RELE(avp);
1992 
1993         return (error);
1994 }
1995 
/*
 * When non-zero, nfs3lookup_otw() enters a negative (DNLC_NO_VNODE)
 * entry into the DNLC for a name the server reported as ENOENT.
 */
static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * Statistics kept by nfs3lookup_dnlc().  File-scope statics start at 0.
 */
static int nfs3_lookup_dnlc_lookups;    /* calls to nfs3lookup_dnlc() */
static int nfs3_lookup_dnlc_hits;       /* a vnode was returned */
static int nfs3_lookup_dnlc_neg_hits;   /* negative entry, ENOENT returned */
static int nfs3_lookup_dnlc_misses;     /* name was not in the DNLC */
static int nfs3_lookup_dnlc_disappears; /* entry gone after revalidation */
#endif
2005 
2006 /* ARGSUSED */
2007 int
2008 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
2009         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2010 {
2011         int error;
2012         rnode_t *drp;
2013 
2014         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2015         /*
2016          * If lookup is for "", just return dvp.  Don't need
2017          * to send it over the wire, look it up in the dnlc,
2018          * or perform any access checks.
2019          */
2020         if (*nm == '\0') {
2021                 VN_HOLD(dvp);
2022                 *vpp = dvp;
2023                 return (0);
2024         }
2025 
2026         /*
2027          * Can't do lookups in non-directories.
2028          */
2029         if (dvp->v_type != VDIR)
2030                 return (ENOTDIR);
2031 
2032         /*
2033          * If we're called with RFSCALL_SOFT, it's important that
2034          * the only rfscall is one we make directly; if we permit
2035          * an access call because we're looking up "." or validating
2036          * a dnlc hit, we'll deadlock because that rfscall will not
2037          * have the RFSCALL_SOFT set.
2038          */
2039         if (rfscall_flags & RFSCALL_SOFT)
2040                 goto callit;
2041 
2042         /*
2043          * If lookup is for ".", just return dvp.  Don't need
2044          * to send it over the wire or look it up in the dnlc,
2045          * just need to check access.
2046          */
2047         if (strcmp(nm, ".") == 0) {
2048                 error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2049                 if (error)
2050                         return (error);
2051                 VN_HOLD(dvp);
2052                 *vpp = dvp;
2053                 return (0);
2054         }
2055 
2056         drp = VTOR(dvp);
2057         if (!(drp->r_flags & RLOOKUP)) {
2058                 mutex_enter(&drp->r_statelock);
2059                 drp->r_flags |= RLOOKUP;
2060                 mutex_exit(&drp->r_statelock);
2061         }
2062 
2063         /*
2064          * Lookup this name in the DNLC.  If there was a valid entry,
2065          * then return the results of the lookup.
2066          */
2067         error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2068         if (error || *vpp != NULL)
2069                 return (error);
2070 
2071 callit:
2072         error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2073 
2074         return (error);
2075 }
2076 
2077 static int
2078 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2079 {
2080         int error;
2081         vnode_t *vp;
2082 
2083         ASSERT(*nm != '\0');
2084         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2085         /*
2086          * Lookup this name in the DNLC.  If successful, then validate
2087          * the caches and then recheck the DNLC.  The DNLC is rechecked
2088          * just in case this entry got invalidated during the call
2089          * to nfs3_validate_caches.
2090          *
2091          * An assumption is being made that it is safe to say that a
2092          * file exists which may not on the server.  Any operations to
2093          * the server will fail with ESTALE.
2094          */
2095 #ifdef DEBUG
2096         nfs3_lookup_dnlc_lookups++;
2097 #endif
2098         vp = dnlc_lookup(dvp, nm);
2099         if (vp != NULL) {
2100                 VN_RELE(vp);
2101                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2102                         PURGE_ATTRCACHE(dvp);
2103                 }
2104                 error = nfs3_validate_caches(dvp, cr);
2105                 if (error)
2106                         return (error);
2107                 vp = dnlc_lookup(dvp, nm);
2108                 if (vp != NULL) {
2109                         error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2110                         if (error) {
2111                                 VN_RELE(vp);
2112                                 return (error);
2113                         }
2114                         if (vp == DNLC_NO_VNODE) {
2115                                 VN_RELE(vp);
2116 #ifdef DEBUG
2117                                 nfs3_lookup_dnlc_neg_hits++;
2118 #endif
2119                                 return (ENOENT);
2120                         }
2121                         *vpp = vp;
2122 #ifdef DEBUG
2123                         nfs3_lookup_dnlc_hits++;
2124 #endif
2125                         return (0);
2126                 }
2127 #ifdef DEBUG
2128                 nfs3_lookup_dnlc_disappears++;
2129 #endif
2130         }
2131 #ifdef DEBUG
2132         else
2133                 nfs3_lookup_dnlc_misses++;
2134 #endif
2135 
2136         *vpp = NULL;
2137 
2138         return (0);
2139 }
2140 
2141 static int
2142 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2143         int rfscall_flags)
2144 {
2145         int error;
2146         LOOKUP3args args;
2147         LOOKUP3vres res;
2148         int douprintf;
2149         struct vattr vattr;
2150         struct vattr dvattr;
2151         vnode_t *vp;
2152         failinfo_t fi;
2153         hrtime_t t;
2154 
2155         ASSERT(*nm != '\0');
2156         ASSERT(dvp->v_type == VDIR);
2157         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2158 
2159         setdiropargs3(&args.what, nm, dvp);
2160 
2161         fi.vp = dvp;
2162         fi.fhp = (caddr_t)&args.what.dir;
2163         fi.copyproc = nfs3copyfh;
2164         fi.lookupproc = nfs3lookup;
2165         fi.xattrdirproc = acl_getxattrdir3;
2166         res.obj_attributes.fres.vp = dvp;
2167         res.obj_attributes.fres.vap = &vattr;
2168         res.dir_attributes.fres.vp = dvp;
2169         res.dir_attributes.fres.vap = &dvattr;
2170 
2171         douprintf = 1;
2172 
2173         t = gethrtime();
2174 
2175         error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2176             xdr_diropargs3, (caddr_t)&args,
2177             xdr_LOOKUP3vres, (caddr_t)&res, cr,
2178             &douprintf, &res.status, rfscall_flags, &fi);
2179 
2180         if (error)
2181                 return (error);
2182 
2183         nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2184 
2185         error = geterrno3(res.status);
2186         if (error) {
2187                 PURGE_STALE_FH(error, dvp, cr);
2188                 if (error == ENOENT && nfs3_lookup_neg_cache)
2189                         dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2190                 return (error);
2191         }
2192 
2193         if (res.obj_attributes.attributes) {
2194                 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2195                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2196         } else {
2197                 vp = makenfs3node_va(&res.object, NULL,
2198                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2199                 if (vp->v_type == VNON) {
2200                         vattr.va_mask = AT_TYPE;
2201                         error = nfs3getattr(vp, &vattr, cr);
2202                         if (error) {
2203                                 VN_RELE(vp);
2204                                 return (error);
2205                         }
2206                         vp->v_type = vattr.va_type;
2207                 }
2208         }
2209 
2210         if (!(rfscall_flags & RFSCALL_SOFT))
2211                 dnlc_update(dvp, nm, vp);
2212 
2213         *vpp = vp;
2214 
2215         return (error);
2216 }
2217 
#ifdef DEBUG
/*
 * DEBUG-only counter; as a file-scope static it starts out at 0.
 * NOTE(review): going by the name this counts misses on the create path,
 * but the code that updates it is not visible here -- confirm against
 * nfs3_create().
 */
static int nfs3_create_misses;
#endif
2221 
2222 /* ARGSUSED */
2223 static int
2224 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2225         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2226         vsecattr_t *vsecp)
2227 {
2228         int error;
2229         vnode_t *vp;
2230         rnode_t *rp;
2231         struct vattr vattr;
2232         rnode_t *drp;
2233         vnode_t *tempvp;
2234 
2235         drp = VTOR(dvp);
2236         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2237                 return (EPERM);
2238         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2239                 return (EINTR);
2240 
2241 top:
2242         /*
2243          * We make a copy of the attributes because the caller does not
2244          * expect us to change what va points to.
2245          */
2246         vattr = *va;
2247 
2248         /*
2249          * If the pathname is "", just use dvp.  Don't need
2250          * to send it over the wire, look it up in the dnlc,
2251          * or perform any access checks.
2252          */
2253         if (*nm == '\0') {
2254                 error = 0;
2255                 VN_HOLD(dvp);
2256                 vp = dvp;
2257         /*
2258          * If the pathname is ".", just use dvp.  Don't need
2259          * to send it over the wire or look it up in the dnlc,
2260          * just need to check access.
2261          */
2262         } else if (strcmp(nm, ".") == 0) {
2263                 error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2264                 if (error) {
2265                         nfs_rw_exit(&drp->r_rwlock);
2266                         return (error);
2267                 }
2268                 VN_HOLD(dvp);
2269                 vp = dvp;
2270         /*
2271          * We need to go over the wire, just to be sure whether the
2272          * file exists or not.  Using the DNLC can be dangerous in
2273          * this case when making a decision regarding existence.
2274          */
2275         } else {
2276                 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2277         }
2278         if (!error) {
2279                 if (exclusive == EXCL)
2280                         error = EEXIST;
2281                 else if (vp->v_type == VDIR && (mode & VWRITE))
2282                         error = EISDIR;
2283                 else {
2284                         /*
2285                          * If vnode is a device, create special vnode.
2286                          */
2287                         if (IS_DEVVP(vp)) {
2288                                 tempvp = vp;
2289                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2290                                 VN_RELE(tempvp);
2291                         }
2292                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2293                                 if ((vattr.va_mask & AT_SIZE) &&
2294                                     vp->v_type == VREG) {
2295                                         rp = VTOR(vp);
2296                                         /*
2297                                          * Check here for large file handled
2298                                          * by LF-unaware process (as
2299                                          * ufs_create() does)
2300                                          */
2301                                         if (!(lfaware & FOFFMAX)) {
2302                                                 mutex_enter(&rp->r_statelock);
2303                                                 if (rp->r_size > MAXOFF32_T)
2304                                                         error = EOVERFLOW;
2305                                                 mutex_exit(&rp->r_statelock);
2306                                         }
2307                                         if (!error) {
2308                                                 vattr.va_mask = AT_SIZE;
2309                                                 error = nfs3setattr(vp,
2310                                                     &vattr, 0, cr);
2311 
2312                                                 /*
2313                                                  * Existing file was truncated;
2314                                                  * emit a create event.
2315                                                  */
2316                                                 vnevent_create(vp, ct);
2317                                         }
2318                                 }
2319                         }
2320                 }
2321                 nfs_rw_exit(&drp->r_rwlock);
2322                 if (error) {
2323                         VN_RELE(vp);
2324                 } else {
2325                         *vpp = vp;
2326                 }
2327 
2328                 return (error);
2329         }
2330 
2331         dnlc_remove(dvp, nm);
2332 
2333         /*
2334          * Decide what the group-id of the created file should be.
2335          * Set it in attribute list as advisory...
2336          */
2337         error = setdirgid(dvp, &vattr.va_gid, cr);
2338         if (error) {
2339                 nfs_rw_exit(&drp->r_rwlock);
2340                 return (error);
2341         }
2342         vattr.va_mask |= AT_GID;
2343 
2344         ASSERT(vattr.va_mask & AT_TYPE);
2345         if (vattr.va_type == VREG) {
2346                 ASSERT(vattr.va_mask & AT_MODE);
2347                 if (MANDMODE(vattr.va_mode)) {
2348                         nfs_rw_exit(&drp->r_rwlock);
2349                         return (EACCES);
2350                 }
2351                 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2352                     lfaware);
2353                 /*
2354                  * If this is not an exclusive create, then the CREATE
2355                  * request will be made with the GUARDED mode set.  This
2356                  * means that the server will return EEXIST if the file
2357                  * exists.  The file could exist because of a retransmitted
2358                  * request.  In this case, we recover by starting over and
2359                  * checking to see whether the file exists.  This second
2360                  * time through it should and a CREATE request will not be
2361                  * sent.
2362                  *
2363                  * This handles the problem of a dangling CREATE request
2364                  * which contains attributes which indicate that the file
2365                  * should be truncated.  This retransmitted request could
2366                  * possibly truncate valid data in the file if not caught
2367                  * by the duplicate request mechanism on the server or if
2368                  * not caught by other means.  The scenario is:
2369                  *
2370                  * Client transmits CREATE request with size = 0
2371                  * Client times out, retransmits request.
2372                  * Response to the first request arrives from the server
2373                  *  and the client proceeds on.
2374                  * Client writes data to the file.
2375                  * The server now processes retransmitted CREATE request
2376                  *  and truncates file.
2377                  *
2378                  * The use of the GUARDED CREATE request prevents this from
2379                  * happening because the retransmitted CREATE would fail
2380                  * with EEXIST and would not truncate the file.
2381                  */
2382                 if (error == EEXIST && exclusive == NONEXCL) {
2383 #ifdef DEBUG
2384                         nfs3_create_misses++;
2385 #endif
2386                         goto top;
2387                 }
2388                 nfs_rw_exit(&drp->r_rwlock);
2389                 return (error);
2390         }
2391         error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2392         nfs_rw_exit(&drp->r_rwlock);
2393         return (error);
2394 }
2395 
2396 /* ARGSUSED */
2397 static int
2398 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2399         int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2400 {
2401         int error;
2402         CREATE3args args;
2403         CREATE3res res;
2404         int douprintf;
2405         vnode_t *vp;
2406         struct vattr vattr;
2407         nfstime3 *verfp;
2408         rnode_t *rp;
2409         timestruc_t now;
2410         hrtime_t t;
2411 
2412         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2413         setdiropargs3(&args.where, nm, dvp);
2414         if (exclusive == EXCL) {
2415                 args.how.mode = EXCLUSIVE;
2416                 /*
2417                  * Construct the create verifier.  This verifier needs
2418                  * to be unique between different clients.  It also needs
2419                  * to vary for each exclusive create request generated
2420                  * from the client to the server.
2421                  *
2422                  * The first attempt is made to use the hostid and a
2423                  * unique number on the client.  If the hostid has not
2424                  * been set, the high resolution time that the exclusive
2425                  * create request is being made is used.  This will work
2426                  * unless two different clients, both with the hostid
2427                  * not set, attempt an exclusive create request on the
2428                  * same file, at exactly the same clock time.  The
2429                  * chances of this happening seem small enough to be
2430                  * reasonable.
2431                  */
2432                 verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2433                 verfp->seconds = zone_get_hostid(NULL);
2434                 if (verfp->seconds != 0)
2435                         verfp->nseconds = newnum();
2436                 else {
2437                         gethrestime(&now);
2438                         verfp->seconds = now.tv_sec;
2439                         verfp->nseconds = now.tv_nsec;
2440                 }
2441                 /*
2442                  * Since the server will use this value for the mtime,
2443                  * make sure that it can't overflow. Zero out the MSB.
2444                  * The actual value does not matter here, only its uniqeness.
2445                  */
2446                 verfp->seconds %= INT32_MAX;
2447         } else {
2448                 /*
2449                  * Issue the non-exclusive create in guarded mode.  This
2450                  * may result in some false EEXIST responses for
2451                  * retransmitted requests, but these will be handled at
2452                  * a higher level.  By using GUARDED, duplicate requests
2453                  * to do file truncation and possible access problems
2454                  * can be avoided.
2455                  */
2456                 args.how.mode = GUARDED;
2457                 error = vattr_to_sattr3(va,
2458                     &args.how.createhow3_u.obj_attributes);
2459                 if (error) {
2460                         /* req time field(s) overflow - return immediately */
2461                         return (error);
2462                 }
2463         }
2464 
2465         douprintf = 1;
2466 
2467         t = gethrtime();
2468 
2469         error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2470             xdr_CREATE3args, (caddr_t)&args,
2471             xdr_CREATE3res, (caddr_t)&res, cr,
2472             &douprintf, &res.status, 0, NULL);
2473 
2474         if (error) {
2475                 PURGE_ATTRCACHE(dvp);
2476                 return (error);
2477         }
2478 
2479         error = geterrno3(res.status);
2480         if (!error) {
2481                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2482                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2483                         nfs_purge_rddir_cache(dvp);
2484 
2485                 /*
2486                  * On exclusive create the times need to be explicitly
2487                  * set to clear any potential verifier that may be stored
2488                  * in one of these fields (see comment below).  This
2489                  * is done here to cover the case where no post op attrs
2490                  * were returned or a 'invalid' time was returned in
2491                  * the attributes.
2492                  */
2493                 if (exclusive == EXCL)
2494                         va->va_mask |= (AT_MTIME | AT_ATIME);
2495 
2496                 if (!res.resok.obj.handle_follows) {
2497                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2498                         if (error)
2499                                 return (error);
2500                 } else {
2501                         if (res.resok.obj_attributes.attributes) {
2502                                 vp = makenfs3node(&res.resok.obj.handle,
2503                                     &res.resok.obj_attributes.attr,
2504                                     dvp->v_vfsp, t, cr, NULL, NULL);
2505                         } else {
2506                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2507                                     dvp->v_vfsp, t, cr, NULL, NULL);
2508 
2509                                 /*
2510                                  * On an exclusive create, it is possible
2511                                  * that attributes were returned but those
2512                                  * postop attributes failed to decode
2513                                  * properly.  If this is the case,
2514                                  * then most likely the atime or mtime
2515                                  * were invalid for our client; this
2516                                  * is caused by the server storing the
2517                                  * create verifier in one of the time
2518                                  * fields(most likely mtime).
2519                                  * So... we are going to setattr just the
2520                                  * atime/mtime to clear things up.
2521                                  */
2522                                 if (exclusive == EXCL) {
2523                                         if (error =
2524                                             nfs3excl_create_settimes(vp,
2525                                             va, cr)) {
2526                                                 /*
2527                                                  * Setting the times failed.
2528                                                  * Remove the file and return
2529                                                  * the error.
2530                                                  */
2531                                                 VN_RELE(vp);
2532                                                 (void) nfs3_remove(dvp,
2533                                                     nm, cr, NULL, 0);
2534                                                 return (error);
2535                                         }
2536                                 }
2537 
2538                                 /*
2539                                  * This handles the non-exclusive case
2540                                  * and the exclusive case where no post op
2541                                  * attrs were returned.
2542                                  */
2543                                 if (vp->v_type == VNON) {
2544                                         vattr.va_mask = AT_TYPE;
2545                                         error = nfs3getattr(vp, &vattr, cr);
2546                                         if (error) {
2547                                                 VN_RELE(vp);
2548                                                 return (error);
2549                                         }
2550                                         vp->v_type = vattr.va_type;
2551                                 }
2552                         }
2553                         dnlc_update(dvp, nm, vp);
2554                 }
2555 
2556                 rp = VTOR(vp);
2557 
2558                 /*
2559                  * Check here for large file handled by
2560                  * LF-unaware process (as ufs_create() does)
2561                  */
2562                 if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2563                     !(lfaware & FOFFMAX)) {
2564                         mutex_enter(&rp->r_statelock);
2565                         if (rp->r_size > MAXOFF32_T) {
2566                                 mutex_exit(&rp->r_statelock);
2567                                 VN_RELE(vp);
2568                                 return (EOVERFLOW);
2569                         }
2570                         mutex_exit(&rp->r_statelock);
2571                 }
2572 
2573                 if (exclusive == EXCL &&
2574                     (va->va_mask & ~(AT_GID | AT_SIZE))) {
2575                         /*
2576                          * If doing an exclusive create, then generate
2577                          * a SETATTR to set the initial attributes.
2578                          * Try to set the mtime and the atime to the
2579                          * server's current time.  It is somewhat
2580                          * expected that these fields will be used to
2581                          * store the exclusive create cookie.  If not,
2582                          * server implementors will need to know that
2583                          * a SETATTR will follow an exclusive create
2584                          * and the cookie should be destroyed if
2585                          * appropriate. This work may have been done
2586                          * earlier in this function if post op attrs
2587                          * were not available.
2588                          *
2589                          * The AT_GID and AT_SIZE bits are turned off
2590                          * so that the SETATTR request will not attempt
2591                          * to process these.  The gid will be set
2592                          * separately if appropriate.  The size is turned
2593                          * off because it is assumed that a new file will
2594                          * be created empty and if the file wasn't empty,
2595                          * then the exclusive create will have failed
2596                          * because the file must have existed already.
2597                          * Therefore, no truncate operation is needed.
2598                          */
2599                         va->va_mask &= ~(AT_GID | AT_SIZE);
2600                         error = nfs3setattr(vp, va, 0, cr);
2601                         if (error) {
2602                                 /*
2603                                  * Couldn't correct the attributes of
2604                                  * the newly created file and the
2605                                  * attributes are wrong.  Remove the
2606                                  * file and return an error to the
2607                                  * application.
2608                                  */
2609                                 VN_RELE(vp);
2610                                 (void) nfs3_remove(dvp, nm, cr, NULL, 0);
2611                                 return (error);
2612                         }
2613                 }
2614 
2615                 if (va->va_gid != rp->r_attr.va_gid) {
2616                         /*
2617                          * If the gid on the file isn't right, then
2618                          * generate a SETATTR to attempt to change
2619                          * it.  This may or may not work, depending
2620                          * upon the server's semantics for allowing
2621                          * file ownership changes.
2622                          */
2623                         va->va_mask = AT_GID;
2624                         (void) nfs3setattr(vp, va, 0, cr);
2625                 }
2626 
2627                 /*
2628                  * If vnode is a device create special vnode
2629                  */
2630                 if (IS_DEVVP(vp)) {
2631                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2632                         VN_RELE(vp);
2633                 } else
2634                         *vpp = vp;
2635         } else {
2636                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2637                 PURGE_STALE_FH(error, dvp, cr);
2638         }
2639 
2640         return (error);
2641 }
2642 
2643 /*
2644  * Special setattr function to take care of rest of atime/mtime
2645  * after successful exclusive create.  This function exists to avoid
2646  * handling attributes from the server; exclusive the atime/mtime fields
2647  * may be 'invalid' in client's view and therefore can not be trusted.
2648  */
2649 static int
2650 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2651 {
2652         int error;
2653         uint_t mask;
2654         SETATTR3args args;
2655         SETATTR3res res;
2656         int douprintf;
2657         rnode_t *rp;
2658         hrtime_t t;
2659 
2660         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2661         /* save the caller's mask so that it can be reset later */
2662         mask = vap->va_mask;
2663 
2664         rp = VTOR(vp);
2665 
2666         args.object = *RTOFH3(rp);
2667         args.guard.check = FALSE;
2668 
2669         /* Use the mask to initialize the arguments */
2670         vap->va_mask = 0;
2671         error = vattr_to_sattr3(vap, &args.new_attributes);
2672 
2673         /* We want to set just atime/mtime on this request */
2674         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2675         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2676 
2677         douprintf = 1;
2678 
2679         t = gethrtime();
2680 
2681         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2682             xdr_SETATTR3args, (caddr_t)&args,
2683             xdr_SETATTR3res, (caddr_t)&res, cr,
2684             &douprintf, &res.status, 0, NULL);
2685 
2686         if (error) {
2687                 vap->va_mask = mask;
2688                 return (error);
2689         }
2690 
2691         error = geterrno3(res.status);
2692         if (!error) {
2693                 /*
2694                  * It is important to pick up the attributes.
2695                  * Since this is the exclusive create path, the
2696                  * attributes on the initial create were ignored
2697                  * and we need these to have the correct info.
2698                  */
2699                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2700                 /*
2701                  * No need to do the atime/mtime work again so clear
2702                  * the bits.
2703                  */
2704                 mask &= ~(AT_ATIME | AT_MTIME);
2705         } else {
2706                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2707         }
2708 
2709         vap->va_mask = mask;
2710 
2711         return (error);
2712 }
2713 
2714 /* ARGSUSED */
2715 static int
2716 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2717         int mode, vnode_t **vpp, cred_t *cr)
2718 {
2719         int error;
2720         MKNOD3args args;
2721         MKNOD3res res;
2722         int douprintf;
2723         vnode_t *vp;
2724         struct vattr vattr;
2725         hrtime_t t;
2726 
2727         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2728         switch (va->va_type) {
2729         case VCHR:
2730         case VBLK:
2731                 setdiropargs3(&args.where, nm, dvp);
2732                 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2733                 error = vattr_to_sattr3(va,
2734                     &args.what.mknoddata3_u.device.dev_attributes);
2735                 if (error) {
2736                         /* req time field(s) overflow - return immediately */
2737                         return (error);
2738                 }
2739                 args.what.mknoddata3_u.device.spec.specdata1 =
2740                     getmajor(va->va_rdev);
2741                 args.what.mknoddata3_u.device.spec.specdata2 =
2742                     getminor(va->va_rdev);
2743                 break;
2744 
2745         case VFIFO:
2746         case VSOCK:
2747                 setdiropargs3(&args.where, nm, dvp);
2748                 args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2749                 error = vattr_to_sattr3(va,
2750                     &args.what.mknoddata3_u.pipe_attributes);
2751                 if (error) {
2752                         /* req time field(s) overflow - return immediately */
2753                         return (error);
2754                 }
2755                 break;
2756 
2757         default:
2758                 return (EINVAL);
2759         }
2760 
2761         douprintf = 1;
2762 
2763         t = gethrtime();
2764 
2765         error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2766             xdr_MKNOD3args, (caddr_t)&args,
2767             xdr_MKNOD3res, (caddr_t)&res, cr,
2768             &douprintf, &res.status, 0, NULL);
2769 
2770         if (error) {
2771                 PURGE_ATTRCACHE(dvp);
2772                 return (error);
2773         }
2774 
2775         error = geterrno3(res.status);
2776         if (!error) {
2777                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2778                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2779                         nfs_purge_rddir_cache(dvp);
2780 
2781                 if (!res.resok.obj.handle_follows) {
2782                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2783                         if (error)
2784                                 return (error);
2785                 } else {
2786                         if (res.resok.obj_attributes.attributes) {
2787                                 vp = makenfs3node(&res.resok.obj.handle,
2788                                     &res.resok.obj_attributes.attr,
2789                                     dvp->v_vfsp, t, cr, NULL, NULL);
2790                         } else {
2791                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2792                                     dvp->v_vfsp, t, cr, NULL, NULL);
2793                                 if (vp->v_type == VNON) {
2794                                         vattr.va_mask = AT_TYPE;
2795                                         error = nfs3getattr(vp, &vattr, cr);
2796                                         if (error) {
2797                                                 VN_RELE(vp);
2798                                                 return (error);
2799                                         }
2800                                         vp->v_type = vattr.va_type;
2801                                 }
2802 
2803                         }
2804                         dnlc_update(dvp, nm, vp);
2805                 }
2806 
2807                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2808                         va->va_mask = AT_GID;
2809                         (void) nfs3setattr(vp, va, 0, cr);
2810                 }
2811 
2812                 /*
2813                  * If vnode is a device create special vnode
2814                  */
2815                 if (IS_DEVVP(vp)) {
2816                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2817                         VN_RELE(vp);
2818                 } else
2819                         *vpp = vp;
2820         } else {
2821                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2822                 PURGE_STALE_FH(error, dvp, cr);
2823         }
2824         return (error);
2825 }
2826 
2827 /*
2828  * Weirdness: if the vnode to be removed is open
2829  * we rename it instead of removing it and nfs_inactive
2830  * will remove the new name.
2831  */
2832 /* ARGSUSED */
2833 static int
2834 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2835 {
2836         int error;
2837         REMOVE3args args;
2838         REMOVE3res res;
2839         vnode_t *vp;
2840         char *tmpname;
2841         int douprintf;
2842         rnode_t *rp;
2843         rnode_t *drp;
2844         hrtime_t t;
2845 
2846         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2847                 return (EPERM);
2848         drp = VTOR(dvp);
2849         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2850                 return (EINTR);
2851 
2852         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2853         if (error) {
2854                 nfs_rw_exit(&drp->r_rwlock);
2855                 return (error);
2856         }
2857 
2858         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2859                 VN_RELE(vp);
2860                 nfs_rw_exit(&drp->r_rwlock);
2861                 return (EPERM);
2862         }
2863 
2864         /*
2865          * First just remove the entry from the name cache, as it
2866          * is most likely the only entry for this vp.
2867          */
2868         dnlc_remove(dvp, nm);
2869 
2870         /*
2871          * If the file has a v_count > 1 then there may be more than one
2872          * entry in the name cache due multiple links or an open file,
2873          * but we don't have the real reference count so flush all
2874          * possible entries.
2875          */
2876         if (vp->v_count > 1)
2877                 dnlc_purge_vp(vp);
2878 
2879         /*
2880          * Now we have the real reference count on the vnode
2881          */
2882         rp = VTOR(vp);
2883         mutex_enter(&rp->r_statelock);
2884         if (vp->v_count > 1 &&
2885             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2886                 mutex_exit(&rp->r_statelock);
2887                 tmpname = newname();
2888                 error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2889                 if (error)
2890                         kmem_free(tmpname, MAXNAMELEN);
2891                 else {
2892                         mutex_enter(&rp->r_statelock);
2893                         if (rp->r_unldvp == NULL) {
2894                                 VN_HOLD(dvp);
2895                                 rp->r_unldvp = dvp;
2896                                 if (rp->r_unlcred != NULL)
2897                                         crfree(rp->r_unlcred);
2898                                 crhold(cr);
2899                                 rp->r_unlcred = cr;
2900                                 rp->r_unlname = tmpname;
2901                         } else {
2902                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2903                                 rp->r_unlname = tmpname;
2904                         }
2905                         mutex_exit(&rp->r_statelock);
2906                 }
2907         } else {
2908                 mutex_exit(&rp->r_statelock);
2909                 /*
2910                  * We need to flush any dirty pages which happen to
2911                  * be hanging around before removing the file.  This
2912                  * shouldn't happen very often and mostly on file
2913                  * systems mounted "nocto".
2914                  */
2915                 if (vn_has_cached_data(vp) &&
2916                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2917                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2918                         if (error && (error == ENOSPC || error == EDQUOT)) {
2919                                 mutex_enter(&rp->r_statelock);
2920                                 if (!rp->r_error)
2921                                         rp->r_error = error;
2922                                 mutex_exit(&rp->r_statelock);
2923                         }
2924                 }
2925 
2926                 setdiropargs3(&args.object, nm, dvp);
2927 
2928                 douprintf = 1;
2929 
2930                 t = gethrtime();
2931 
2932                 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2933                     xdr_diropargs3, (caddr_t)&args,
2934                     xdr_REMOVE3res, (caddr_t)&res, cr,
2935                     &douprintf, &res.status, 0, NULL);
2936 
2937                 /*
2938                  * The xattr dir may be gone after last attr is removed,
2939                  * so flush it from dnlc.
2940                  */
2941                 if (dvp->v_flag & V_XATTRDIR)
2942                         dnlc_purge_vp(dvp);
2943 
2944                 PURGE_ATTRCACHE(vp);
2945 
2946                 if (error) {
2947                         PURGE_ATTRCACHE(dvp);
2948                 } else {
2949                         error = geterrno3(res.status);
2950                         if (!error) {
2951                                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2952                                     cr);
2953                                 if (HAVE_RDDIR_CACHE(drp))
2954                                         nfs_purge_rddir_cache(dvp);
2955                         } else {
2956                                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2957                                     t, cr);
2958                                 PURGE_STALE_FH(error, dvp, cr);
2959                         }
2960                 }
2961         }
2962 
2963         if (error == 0) {
2964                 vnevent_remove(vp, dvp, nm, ct);
2965         }
2966         VN_RELE(vp);
2967 
2968         nfs_rw_exit(&drp->r_rwlock);
2969 
2970         return (error);
2971 }
2972 
2973 /* ARGSUSED */
2974 static int
2975 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2976         caller_context_t *ct, int flags)
2977 {
2978         int error;
2979         LINK3args args;
2980         LINK3res res;
2981         vnode_t *realvp;
2982         int douprintf;
2983         mntinfo_t *mi;
2984         rnode_t *tdrp;
2985         hrtime_t t;
2986 
2987         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2988                 return (EPERM);
2989         if (VOP_REALVP(svp, &realvp, ct) == 0)
2990                 svp = realvp;
2991 
2992         mi = VTOMI(svp);
2993 
2994         if (!(mi->mi_flags & MI_LINK))
2995                 return (EOPNOTSUPP);
2996 
2997         args.file = *VTOFH3(svp);
2998         setdiropargs3(&args.link, tnm, tdvp);
2999 
3000         tdrp = VTOR(tdvp);
3001         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
3002                 return (EINTR);
3003 
3004         dnlc_remove(tdvp, tnm);
3005 
3006         douprintf = 1;
3007 
3008         t = gethrtime();
3009 
3010         error = rfs3call(mi, NFSPROC3_LINK,
3011             xdr_LINK3args, (caddr_t)&args,
3012             xdr_LINK3res, (caddr_t)&res, cr,
3013             &douprintf, &res.status, 0, NULL);
3014 
3015         if (error) {
3016                 PURGE_ATTRCACHE(tdvp);
3017                 PURGE_ATTRCACHE(svp);
3018                 nfs_rw_exit(&tdrp->r_rwlock);
3019                 return (error);
3020         }
3021 
3022         error = geterrno3(res.status);
3023 
3024         if (!error) {
3025                 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3026                 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3027                 if (HAVE_RDDIR_CACHE(tdrp))
3028                         nfs_purge_rddir_cache(tdvp);
3029                 dnlc_update(tdvp, tnm, svp);
3030         } else {
3031                 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3032                     cr);
3033                 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3034                 if (error == EOPNOTSUPP) {
3035                         mutex_enter(&mi->mi_lock);
3036                         mi->mi_flags &= ~MI_LINK;
3037                         mutex_exit(&mi->mi_lock);
3038                 }
3039         }
3040 
3041         nfs_rw_exit(&tdrp->r_rwlock);
3042 
3043         if (!error) {
3044                 /*
3045                  * Notify the source file of this link operation.
3046                  */
3047                 vnevent_link(svp, ct);
3048         }
3049         return (error);
3050 }
3051 
3052 /* ARGSUSED */
3053 static int
3054 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3055         caller_context_t *ct, int flags)
3056 {
3057         vnode_t *realvp;
3058 
3059         if (nfs_zone() != VTOMI(odvp)->mi_zone)
3060                 return (EPERM);
3061         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3062                 ndvp = realvp;
3063 
3064         return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3065 }
3066 
3067 /*
3068  * nfs3rename does the real work of renaming in NFS Version 3.
3069  */
3070 static int
3071 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3072     caller_context_t *ct)
3073 {
3074         int error;
3075         RENAME3args args;
3076         RENAME3res res;
3077         int douprintf;
3078         vnode_t *nvp = NULL;
3079         vnode_t *ovp = NULL;
3080         char *tmpname;
3081         rnode_t *rp;
3082         rnode_t *odrp;
3083         rnode_t *ndrp;
3084         hrtime_t t;
3085 
3086         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3087 
3088         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3089             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3090                 return (EINVAL);
3091 
3092         odrp = VTOR(odvp);
3093         ndrp = VTOR(ndvp);
3094         if ((intptr_t)odrp < (intptr_t)ndrp) {
3095                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3096                         return (EINTR);
3097                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3098                         nfs_rw_exit(&odrp->r_rwlock);
3099                         return (EINTR);
3100                 }
3101         } else {
3102                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3103                         return (EINTR);
3104                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3105                         nfs_rw_exit(&ndrp->r_rwlock);
3106                         return (EINTR);
3107                 }
3108         }
3109 
3110         /*
3111          * Lookup the target file.  If it exists, it needs to be
3112          * checked to see whether it is a mount point and whether
3113          * it is active (open).
3114          */
3115         error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3116         if (!error) {
3117                 /*
3118                  * If this file has been mounted on, then just
3119                  * return busy because renaming to it would remove
3120                  * the mounted file system from the name space.
3121                  */
3122                 if (vn_mountedvfs(nvp) != NULL) {
3123                         VN_RELE(nvp);
3124                         nfs_rw_exit(&odrp->r_rwlock);
3125                         nfs_rw_exit(&ndrp->r_rwlock);
3126                         return (EBUSY);
3127                 }
3128 
3129                 /*
3130                  * Purge the name cache of all references to this vnode
3131                  * so that we can check the reference count to infer
3132                  * whether it is active or not.
3133                  */
3134                 /*
3135                  * First just remove the entry from the name cache, as it
3136                  * is most likely the only entry for this vp.
3137                  */
3138                 dnlc_remove(ndvp, nnm);
3139                 /*
3140                  * If the file has a v_count > 1 then there may be more
3141                  * than one entry in the name cache due multiple links
3142                  * or an open file, but we don't have the real reference
3143                  * count so flush all possible entries.
3144                  */
3145                 if (nvp->v_count > 1)
3146                         dnlc_purge_vp(nvp);
3147 
3148                 /*
3149                  * If the vnode is active and is not a directory,
3150                  * arrange to rename it to a
3151                  * temporary file so that it will continue to be
3152                  * accessible.  This implements the "unlink-open-file"
3153                  * semantics for the target of a rename operation.
3154                  * Before doing this though, make sure that the
3155                  * source and target files are not already the same.
3156                  */
3157                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3158                         /*
3159                          * Lookup the source name.
3160                          */
3161                         error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3162                             cr, 0);
3163 
3164                         /*
3165                          * The source name *should* already exist.
3166                          */
3167                         if (error) {
3168                                 VN_RELE(nvp);
3169                                 nfs_rw_exit(&odrp->r_rwlock);
3170                                 nfs_rw_exit(&ndrp->r_rwlock);
3171                                 return (error);
3172                         }
3173 
3174                         /*
3175                          * Compare the two vnodes.  If they are the same,
3176                          * just release all held vnodes and return success.
3177                          */
3178                         if (ovp == nvp) {
3179                                 VN_RELE(ovp);
3180                                 VN_RELE(nvp);
3181                                 nfs_rw_exit(&odrp->r_rwlock);
3182                                 nfs_rw_exit(&ndrp->r_rwlock);
3183                                 return (0);
3184                         }
3185 
3186                         /*
3187                          * Can't mix and match directories and non-
3188                          * directories in rename operations.  We already
3189                          * know that the target is not a directory.  If
3190                          * the source is a directory, return an error.
3191                          */
3192                         if (ovp->v_type == VDIR) {
3193                                 VN_RELE(ovp);
3194                                 VN_RELE(nvp);
3195                                 nfs_rw_exit(&odrp->r_rwlock);
3196                                 nfs_rw_exit(&ndrp->r_rwlock);
3197                                 return (ENOTDIR);
3198                         }
3199 
3200                         /*
3201                          * The target file exists, is not the same as
3202                          * the source file, and is active.  Link it
3203                          * to a temporary filename to avoid having
3204                          * the server removing the file completely.
3205                          */
3206                         tmpname = newname();
3207                         error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3208                         if (error == EOPNOTSUPP) {
3209                                 error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3210                                     cr, NULL, 0);
3211                         }
3212                         if (error) {
3213                                 kmem_free(tmpname, MAXNAMELEN);
3214                                 VN_RELE(ovp);
3215                                 VN_RELE(nvp);
3216                                 nfs_rw_exit(&odrp->r_rwlock);
3217                                 nfs_rw_exit(&ndrp->r_rwlock);
3218                                 return (error);
3219                         }
3220                         rp = VTOR(nvp);
3221                         mutex_enter(&rp->r_statelock);
3222                         if (rp->r_unldvp == NULL) {
3223                                 VN_HOLD(ndvp);
3224                                 rp->r_unldvp = ndvp;
3225                                 if (rp->r_unlcred != NULL)
3226                                         crfree(rp->r_unlcred);
3227                                 crhold(cr);
3228                                 rp->r_unlcred = cr;
3229                                 rp->r_unlname = tmpname;
3230                         } else {
3231                                 kmem_free(rp->r_unlname, MAXNAMELEN);
3232                                 rp->r_unlname = tmpname;
3233                         }
3234                         mutex_exit(&rp->r_statelock);
3235                 }
3236         }
3237 
3238         if (ovp == NULL) {
3239                 /*
3240                  * When renaming directories to be a subdirectory of a
3241                  * different parent, the dnlc entry for ".." will no
3242                  * longer be valid, so it must be removed.
3243                  *
3244                  * We do a lookup here to determine whether we are renaming
3245                  * a directory and we need to check if we are renaming
3246                  * an unlinked file.  This might have already been done
3247                  * in previous code, so we check ovp == NULL to avoid
3248                  * doing it twice.
3249                  */
3250 
3251                 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3252                 /*
3253                  * The source name *should* already exist.
3254                  */
3255                 if (error) {
3256                         nfs_rw_exit(&odrp->r_rwlock);
3257                         nfs_rw_exit(&ndrp->r_rwlock);
3258                         if (nvp) {
3259                                 VN_RELE(nvp);
3260                         }
3261                         return (error);
3262                 }
3263                 ASSERT(ovp != NULL);
3264         }
3265 
3266         dnlc_remove(odvp, onm);
3267         dnlc_remove(ndvp, nnm);
3268 
3269         setdiropargs3(&args.from, onm, odvp);
3270         setdiropargs3(&args.to, nnm, ndvp);
3271 
3272         douprintf = 1;
3273 
3274         t = gethrtime();
3275 
3276         error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3277             xdr_RENAME3args, (caddr_t)&args,
3278             xdr_RENAME3res, (caddr_t)&res, cr,
3279             &douprintf, &res.status, 0, NULL);
3280 
3281         if (error) {
3282                 PURGE_ATTRCACHE(odvp);
3283                 PURGE_ATTRCACHE(ndvp);
3284                 VN_RELE(ovp);
3285                 nfs_rw_exit(&odrp->r_rwlock);
3286                 nfs_rw_exit(&ndrp->r_rwlock);
3287                 if (nvp) {
3288                         VN_RELE(nvp);
3289                 }
3290                 return (error);
3291         }
3292 
3293         error = geterrno3(res.status);
3294 
3295         if (!error) {
3296                 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3297                 if (HAVE_RDDIR_CACHE(odrp))
3298                         nfs_purge_rddir_cache(odvp);
3299                 if (ndvp != odvp) {
3300                         nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3301                         if (HAVE_RDDIR_CACHE(ndrp))
3302                                 nfs_purge_rddir_cache(ndvp);
3303                 }
3304                 /*
3305                  * when renaming directories to be a subdirectory of a
3306                  * different parent, the dnlc entry for ".." will no
3307                  * longer be valid, so it must be removed
3308                  */
3309                 rp = VTOR(ovp);
3310                 if (ndvp != odvp) {
3311                         if (ovp->v_type == VDIR) {
3312                                 dnlc_remove(ovp, "..");
3313                                 if (HAVE_RDDIR_CACHE(rp))
3314                                         nfs_purge_rddir_cache(ovp);
3315                         }
3316                 }
3317 
3318                 /*
3319                  * If we are renaming the unlinked file, update the
3320                  * r_unldvp and r_unlname as needed.
3321                  */
3322                 mutex_enter(&rp->r_statelock);
3323                 if (rp->r_unldvp != NULL) {
3324                         if (strcmp(rp->r_unlname, onm) == 0) {
3325                                 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3326                                 rp->r_unlname[MAXNAMELEN - 1] = '\0';
3327 
3328                                 if (ndvp != rp->r_unldvp) {
3329                                         VN_RELE(rp->r_unldvp);
3330                                         rp->r_unldvp = ndvp;
3331                                         VN_HOLD(ndvp);
3332                                 }
3333                         }
3334                 }
3335                 mutex_exit(&rp->r_statelock);
3336         } else {
3337                 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3338                 if (ndvp != odvp) {
3339                         nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3340                             cr);
3341                 }
3342                 /*
3343                  * System V defines rename to return EEXIST, not
3344                  * ENOTEMPTY if the target directory is not empty.
3345                  * Over the wire, the error is NFSERR_ENOTEMPTY
3346                  * which geterrno maps to ENOTEMPTY.
3347                  */
3348                 if (error == ENOTEMPTY)
3349                         error = EEXIST;
3350         }
3351 
3352         if (error == 0) {
3353                 if (nvp)
3354                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
3355 
3356                 if (odvp != ndvp)
3357                         vnevent_rename_dest_dir(ndvp, ct);
3358                 ASSERT(ovp != NULL);
3359                 vnevent_rename_src(ovp, odvp, onm, ct);
3360         }
3361 
3362         if (nvp) {
3363                 VN_RELE(nvp);
3364         }
3365         VN_RELE(ovp);
3366 
3367         nfs_rw_exit(&odrp->r_rwlock);
3368         nfs_rw_exit(&ndrp->r_rwlock);
3369 
3370         return (error);
3371 }
3372 
3373 /* ARGSUSED */
3374 static int
3375 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3376         caller_context_t *ct, int flags, vsecattr_t *vsecp)
3377 {
3378         int error;
3379         MKDIR3args args;
3380         MKDIR3res res;
3381         int douprintf;
3382         struct vattr vattr;
3383         vnode_t *vp;
3384         rnode_t *drp;
3385         hrtime_t t;
3386 
3387         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3388                 return (EPERM);
3389         setdiropargs3(&args.where, nm, dvp);
3390 
3391         /*
3392          * Decide what the group-id and set-gid bit of the created directory
3393          * should be.  May have to do a setattr to get the gid right.
3394          */
3395         error = setdirgid(dvp, &va->va_gid, cr);
3396         if (error)
3397                 return (error);
3398         error = setdirmode(dvp, &va->va_mode, cr);
3399         if (error)
3400                 return (error);
3401         va->va_mask |= AT_MODE|AT_GID;
3402 
3403         error = vattr_to_sattr3(va, &args.attributes);
3404         if (error) {
3405                 /* req time field(s) overflow - return immediately */
3406                 return (error);
3407         }
3408 
3409         drp = VTOR(dvp);
3410         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3411                 return (EINTR);
3412 
3413         dnlc_remove(dvp, nm);
3414 
3415         douprintf = 1;
3416 
3417         t = gethrtime();
3418 
3419         error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3420             xdr_MKDIR3args, (caddr_t)&args,
3421             xdr_MKDIR3res, (caddr_t)&res, cr,
3422             &douprintf, &res.status, 0, NULL);
3423 
3424         if (error) {
3425                 PURGE_ATTRCACHE(dvp);
3426                 nfs_rw_exit(&drp->r_rwlock);
3427                 return (error);
3428         }
3429 
3430         error = geterrno3(res.status);
3431         if (!error) {
3432                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3433                 if (HAVE_RDDIR_CACHE(drp))
3434                         nfs_purge_rddir_cache(dvp);
3435 
3436                 if (!res.resok.obj.handle_follows) {
3437                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3438                         if (error) {
3439                                 nfs_rw_exit(&drp->r_rwlock);
3440                                 return (error);
3441                         }
3442                 } else {
3443                         if (res.resok.obj_attributes.attributes) {
3444                                 vp = makenfs3node(&res.resok.obj.handle,
3445                                     &res.resok.obj_attributes.attr,
3446                                     dvp->v_vfsp, t, cr, NULL, NULL);
3447                         } else {
3448                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3449                                     dvp->v_vfsp, t, cr, NULL, NULL);
3450                                 if (vp->v_type == VNON) {
3451                                         vattr.va_mask = AT_TYPE;
3452                                         error = nfs3getattr(vp, &vattr, cr);
3453                                         if (error) {
3454                                                 VN_RELE(vp);
3455                                                 nfs_rw_exit(&drp->r_rwlock);
3456                                                 return (error);
3457                                         }
3458                                         vp->v_type = vattr.va_type;
3459                                 }
3460                         }
3461                         dnlc_update(dvp, nm, vp);
3462                 }
3463                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3464                         va->va_mask = AT_GID;
3465                         (void) nfs3setattr(vp, va, 0, cr);
3466                 }
3467                 *vpp = vp;
3468         } else {
3469                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3470                 PURGE_STALE_FH(error, dvp, cr);
3471         }
3472 
3473         nfs_rw_exit(&drp->r_rwlock);
3474 
3475         return (error);
3476 }
3477 
3478 /* ARGSUSED */
3479 static int
3480 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3481         caller_context_t *ct, int flags)
3482 {
3483         int error;
3484         RMDIR3args args;
3485         RMDIR3res res;
3486         vnode_t *vp;
3487         int douprintf;
3488         rnode_t *drp;
3489         hrtime_t t;
3490 
3491         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3492                 return (EPERM);
3493         drp = VTOR(dvp);
3494         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3495                 return (EINTR);
3496 
3497         /*
3498          * Attempt to prevent a rmdir(".") from succeeding.
3499          */
3500         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3501         if (error) {
3502                 nfs_rw_exit(&drp->r_rwlock);
3503                 return (error);
3504         }
3505 
3506         if (vp == cdir) {
3507                 VN_RELE(vp);
3508                 nfs_rw_exit(&drp->r_rwlock);
3509                 return (EINVAL);
3510         }
3511 
3512         setdiropargs3(&args.object, nm, dvp);
3513 
3514         /*
3515          * First just remove the entry from the name cache, as it
3516          * is most likely an entry for this vp.
3517          */
3518         dnlc_remove(dvp, nm);
3519 
3520         /*
3521          * If there vnode reference count is greater than one, then
3522          * there may be additional references in the DNLC which will
3523          * need to be purged.  First, trying removing the entry for
3524          * the parent directory and see if that removes the additional
3525          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3526          * to completely remove any references to the directory which
3527          * might still exist in the DNLC.
3528          */
3529         if (vp->v_count > 1) {
3530                 dnlc_remove(vp, "..");
3531                 if (vp->v_count > 1)
3532                         dnlc_purge_vp(vp);
3533         }
3534 
3535         douprintf = 1;
3536 
3537         t = gethrtime();
3538 
3539         error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3540             xdr_diropargs3, (caddr_t)&args,
3541             xdr_RMDIR3res, (caddr_t)&res, cr,
3542             &douprintf, &res.status, 0, NULL);
3543 
3544         PURGE_ATTRCACHE(vp);
3545 
3546         if (error) {
3547                 PURGE_ATTRCACHE(dvp);
3548                 VN_RELE(vp);
3549                 nfs_rw_exit(&drp->r_rwlock);
3550                 return (error);
3551         }
3552 
3553         error = geterrno3(res.status);
3554         if (!error) {
3555                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3556                 if (HAVE_RDDIR_CACHE(drp))
3557                         nfs_purge_rddir_cache(dvp);
3558                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
3559                         nfs_purge_rddir_cache(vp);
3560         } else {
3561                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3562                 PURGE_STALE_FH(error, dvp, cr);
3563                 /*
3564                  * System V defines rmdir to return EEXIST, not
3565                  * ENOTEMPTY if the directory is not empty.  Over
3566                  * the wire, the error is NFSERR_ENOTEMPTY which
3567                  * geterrno maps to ENOTEMPTY.
3568                  */
3569                 if (error == ENOTEMPTY)
3570                         error = EEXIST;
3571         }
3572 
3573         if (error == 0) {
3574                 vnevent_rmdir(vp, dvp, nm, ct);
3575         }
3576         VN_RELE(vp);
3577 
3578         nfs_rw_exit(&drp->r_rwlock);
3579 
3580         return (error);
3581 }
3582 
3583 /* ARGSUSED */
3584 static int
3585 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3586         caller_context_t *ct, int flags)
3587 {
3588         int error;
3589         SYMLINK3args args;
3590         SYMLINK3res res;
3591         int douprintf;
3592         mntinfo_t *mi;
3593         vnode_t *vp;
3594         rnode_t *rp;
3595         char *contents;
3596         rnode_t *drp;
3597         hrtime_t t;
3598 
3599         mi = VTOMI(dvp);
3600 
3601         if (nfs_zone() != mi->mi_zone)
3602                 return (EPERM);
3603         if (!(mi->mi_flags & MI_SYMLINK))
3604                 return (EOPNOTSUPP);
3605 
3606         setdiropargs3(&args.where, lnm, dvp);
3607         error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3608         if (error) {
3609                 /* req time field(s) overflow - return immediately */
3610                 return (error);
3611         }
3612         args.symlink.symlink_data = tnm;
3613 
3614         drp = VTOR(dvp);
3615         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3616                 return (EINTR);
3617 
3618         dnlc_remove(dvp, lnm);
3619 
3620         douprintf = 1;
3621 
3622         t = gethrtime();
3623 
3624         error = rfs3call(mi, NFSPROC3_SYMLINK,
3625             xdr_SYMLINK3args, (caddr_t)&args,
3626             xdr_SYMLINK3res, (caddr_t)&res, cr,
3627             &douprintf, &res.status, 0, NULL);
3628 
3629         if (error) {
3630                 PURGE_ATTRCACHE(dvp);
3631                 nfs_rw_exit(&drp->r_rwlock);
3632                 return (error);
3633         }
3634 
3635         error = geterrno3(res.status);
3636         if (!error) {
3637                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3638                 if (HAVE_RDDIR_CACHE(drp))
3639                         nfs_purge_rddir_cache(dvp);
3640 
3641                 if (res.resok.obj.handle_follows) {
3642                         if (res.resok.obj_attributes.attributes) {
3643                                 vp = makenfs3node(&res.resok.obj.handle,
3644                                     &res.resok.obj_attributes.attr,
3645                                     dvp->v_vfsp, t, cr, NULL, NULL);
3646                         } else {
3647                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3648                                     dvp->v_vfsp, t, cr, NULL, NULL);
3649                                 vp->v_type = VLNK;
3650                                 vp->v_rdev = 0;
3651                         }
3652                         dnlc_update(dvp, lnm, vp);
3653                         rp = VTOR(vp);
3654                         if (nfs3_do_symlink_cache &&
3655                             rp->r_symlink.contents == NULL) {
3656 
3657                                 contents = kmem_alloc(MAXPATHLEN,
3658                                     KM_NOSLEEP);
3659 
3660                                 if (contents != NULL) {
3661                                         mutex_enter(&rp->r_statelock);
3662                                         if (rp->r_symlink.contents == NULL) {
3663                                                 rp->r_symlink.len = strlen(tnm);
3664                                                 bcopy(tnm, contents,
3665                                                     rp->r_symlink.len);
3666                                                 rp->r_symlink.contents =
3667                                                     contents;
3668                                                 rp->r_symlink.size = MAXPATHLEN;
3669                                                 mutex_exit(&rp->r_statelock);
3670                                         } else {
3671                                                 mutex_exit(&rp->r_statelock);
3672                                                 kmem_free((void *)contents,
3673                                                     MAXPATHLEN);
3674                                         }
3675                                 }
3676                         }
3677                         VN_RELE(vp);
3678                 }
3679         } else {
3680                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3681                 PURGE_STALE_FH(error, dvp, cr);
3682                 if (error == EOPNOTSUPP) {
3683                         mutex_enter(&mi->mi_lock);
3684                         mi->mi_flags &= ~MI_SYMLINK;
3685                         mutex_exit(&mi->mi_lock);
3686                 }
3687         }
3688 
3689         nfs_rw_exit(&drp->r_rwlock);
3690 
3691         return (error);
3692 }
3693 
/*
 * Counters for nfs3_readdir, present only in DEBUG builds.  The "shorts"
 * counter is bumped when a request is short-circuited at the cached
 * end-of-directory cookie, and "waits" when a caller finds a cache entry
 * that is still being filled in.  The remaining counters presumably
 * track cache hits, misses, and readahead requests.  Objects with static
 * storage duration start out as zero, so no initializers are needed.
 */
#ifdef DEBUG
static int nfs3_readdir_cache_hits;
static int nfs3_readdir_cache_shorts;
static int nfs3_readdir_cache_waits;
static int nfs3_readdir_cache_misses;
static int nfs3_readdir_readahead;
#endif

/*
 * When nonzero, nfs3_readdir limits its request size to 1024 bytes
 * rather than MAXBSIZE.  Off (zero) by default.
 */
static int nfs3_shrinkreaddir;
3703 
3704 /*
3705  * Read directory entries.
3706  * There are some weird things to look out for here.  The uio_loffset
3707  * field is either 0 or it is the offset returned from a previous
3708  * readdir.  It is an opaque value used by the server to find the
3709  * correct directory block to read. The count field is the number
3710  * of blocks to read on the server.  This is advisory only, the server
3711  * may return only one block's worth of entries.  Entries may be compressed
3712  * on the server.
3713  */
3714 /* ARGSUSED */
3715 static int
3716 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3717         caller_context_t *ct, int flags)
3718 {
3719         int error;
3720         size_t count;
3721         rnode_t *rp;
3722         rddir_cache *rdc;
3723         rddir_cache *nrdc;
3724         rddir_cache *rrdc;
3725 #ifdef DEBUG
3726         int missed;
3727 #endif
3728         int doreadahead;
3729         rddir_cache srdc;
3730         avl_index_t where;
3731 
3732         if (nfs_zone() != VTOMI(vp)->mi_zone)
3733                 return (EIO);
3734         rp = VTOR(vp);
3735 
3736         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3737 
3738         /*
3739          * Make sure that the directory cache is valid.
3740          */
3741         if (HAVE_RDDIR_CACHE(rp)) {
3742                 if (nfs_disable_rddir_cache) {
3743                         /*
3744                          * Setting nfs_disable_rddir_cache in /etc/system
3745                          * allows interoperability with servers that do not
3746                          * properly update the attributes of directories.
3747                          * Any cached information gets purged before an
3748                          * access is made to it.
3749                          */
3750                         nfs_purge_rddir_cache(vp);
3751                 } else {
3752                         error = nfs3_validate_caches(vp, cr);
3753                         if (error)
3754                                 return (error);
3755                 }
3756         }
3757 
3758         /*
3759          * It is possible that some servers may not be able to correctly
3760          * handle a large READDIR or READDIRPLUS request due to bugs in
3761          * their implementation.  In order to continue to interoperate
3762          * with them, this workaround is provided to limit the maximum
3763          * size of a READDIRPLUS request to 1024.  In any case, the request
3764          * size is limited to MAXBSIZE.
3765          */
3766         count = MIN(uiop->uio_iov->iov_len,
3767             nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3768 
3769         nrdc = NULL;
3770 #ifdef DEBUG
3771         missed = 0;
3772 #endif
3773 top:
3774         /*
3775          * Short circuit last readdir which always returns 0 bytes.
3776          * This can be done after the directory has been read through
3777          * completely at least once.  This will set r_direof which
3778          * can be used to find the value of the last cookie.
3779          */
3780         mutex_enter(&rp->r_statelock);
3781         if (rp->r_direof != NULL &&
3782             uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3783                 mutex_exit(&rp->r_statelock);
3784 #ifdef DEBUG
3785                 nfs3_readdir_cache_shorts++;
3786 #endif
3787                 if (eofp)
3788                         *eofp = 1;
3789                 if (nrdc != NULL)
3790                         rddir_cache_rele(nrdc);
3791                 return (0);
3792         }
3793         /*
3794          * Look for a cache entry.  Cache entries are identified
3795          * by the NFS cookie value and the byte count requested.
3796          */
3797         srdc.nfs3_cookie = uiop->uio_loffset;
3798         srdc.buflen = count;
3799         rdc = avl_find(&rp->r_dir, &srdc, &where);
3800         if (rdc != NULL) {
3801                 rddir_cache_hold(rdc);
3802                 /*
3803                  * If the cache entry is in the process of being
3804                  * filled in, wait until this completes.  The
3805                  * RDDIRWAIT bit is set to indicate that someone
3806                  * is waiting and then the thread currently
3807                  * filling the entry is done, it should do a
3808                  * cv_broadcast to wakeup all of the threads
3809                  * waiting for it to finish.
3810                  */
3811                 if (rdc->flags & RDDIR) {
3812                         nfs_rw_exit(&rp->r_rwlock);
3813                         rdc->flags |= RDDIRWAIT;
3814 #ifdef DEBUG
3815                         nfs3_readdir_cache_waits++;
3816 #endif
3817                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3818                                 /*
3819                                  * We got interrupted, probably
3820                                  * the user typed ^C or an alarm
3821                                  * fired.  We free the new entry
3822                                  * if we allocated one.
3823                                  */
3824                                 mutex_exit(&rp->r_statelock);
3825                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3826                                     RW_READER, FALSE);
3827                                 rddir_cache_rele(rdc);
3828                                 if (nrdc != NULL)
3829                                         rddir_cache_rele(nrdc);
3830                                 return (EINTR);
3831                         }
3832                         mutex_exit(&rp->r_statelock);
3833                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3834                             RW_READER, FALSE);
3835                         rddir_cache_rele(rdc);
3836                         goto top;
3837                 }
3838                 /*
3839                  * Check to see if a readdir is required to
3840                  * fill the entry.  If so, mark this entry
3841                  * as being filled, remove our reference,
3842                  * and branch to the code to fill the entry.
3843                  */
3844                 if (rdc->flags & RDDIRREQ) {
3845                         rdc->flags &= ~RDDIRREQ;
3846                         rdc->flags |= RDDIR;
3847                         if (nrdc != NULL)
3848                                 rddir_cache_rele(nrdc);
3849                         nrdc = rdc;
3850                         mutex_exit(&rp->r_statelock);
3851                         goto bottom;
3852                 }
3853 #ifdef DEBUG
3854                 if (!missed)
3855                         nfs3_readdir_cache_hits++;
3856 #endif
3857                 /*
3858                  * If an error occurred while attempting
3859                  * to fill the cache entry, just return it.
3860                  */
3861                 if (rdc->error) {
3862                         error = rdc->error;
3863                         mutex_exit(&rp->r_statelock);
3864                         rddir_cache_rele(rdc);
3865                         if (nrdc != NULL)
3866                                 rddir_cache_rele(nrdc);
3867                         return (error);
3868                 }
3869 
3870                 /*
3871                  * The cache entry is complete and good,
3872                  * copyout the dirent structs to the calling
3873                  * thread.
3874                  */
3875                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3876 
3877                 /*
3878                  * If no error occurred during the copyout,
3879                  * update the offset in the uio struct to
3880                  * contain the value of the next cookie
3881                  * and set the eof value appropriately.
3882                  */
3883                 if (!error) {
3884                         uiop->uio_loffset = rdc->nfs3_ncookie;
3885                         if (eofp)
3886                                 *eofp = rdc->eof;
3887                 }
3888 
3889                 /*
3890                  * Decide whether to do readahead.
3891                  *
3892                  * Don't if have already read to the end of
3893                  * directory.  There is nothing more to read.
3894                  *
3895                  * Don't if the application is not doing
3896                  * lookups in the directory.  The readahead
3897                  * is only effective if the application can
3898                  * be doing work while an async thread is
3899                  * handling the over the wire request.
3900                  */
3901                 if (rdc->eof) {
3902                         rp->r_direof = rdc;
3903                         doreadahead = FALSE;
3904                 } else if (!(rp->r_flags & RLOOKUP))
3905                         doreadahead = FALSE;
3906                 else
3907                         doreadahead = TRUE;
3908 
3909                 if (!doreadahead) {
3910                         mutex_exit(&rp->r_statelock);
3911                         rddir_cache_rele(rdc);
3912                         if (nrdc != NULL)
3913                                 rddir_cache_rele(nrdc);
3914                         return (error);
3915                 }
3916 
3917                 /*
3918                  * Check to see whether we found an entry
3919                  * for the readahead.  If so, we don't need
3920                  * to do anything further, so free the new
3921                  * entry if one was allocated.  Otherwise,
3922                  * allocate a new entry, add it to the cache,
3923                  * and then initiate an asynchronous readdir
3924                  * operation to fill it.
3925                  */
3926                 srdc.nfs3_cookie = rdc->nfs3_ncookie;
3927                 srdc.buflen = count;
3928                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3929                 if (rrdc != NULL) {
3930                         if (nrdc != NULL)
3931                                 rddir_cache_rele(nrdc);
3932                 } else {
3933                         if (nrdc != NULL)
3934                                 rrdc = nrdc;
3935                         else {
3936                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3937                         }
3938                         if (rrdc != NULL) {
3939                                 rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3940                                 rrdc->buflen = count;
3941                                 avl_insert(&rp->r_dir, rrdc, where);
3942                                 rddir_cache_hold(rrdc);
3943                                 mutex_exit(&rp->r_statelock);
3944                                 rddir_cache_rele(rdc);
3945 #ifdef DEBUG
3946                                 nfs3_readdir_readahead++;
3947 #endif
3948                                 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3949                                 return (error);
3950                         }
3951                 }
3952 
3953                 mutex_exit(&rp->r_statelock);
3954                 rddir_cache_rele(rdc);
3955                 return (error);
3956         }
3957 
3958         /*
3959          * Didn't find an entry in the cache.  Construct a new empty
3960          * entry and link it into the cache.  Other processes attempting
3961          * to access this entry will need to wait until it is filled in.
3962          *
3963          * Since kmem_alloc may block, another pass through the cache
3964          * will need to be taken to make sure that another process
3965          * hasn't already added an entry to the cache for this request.
3966          */
3967         if (nrdc == NULL) {
3968                 mutex_exit(&rp->r_statelock);
3969                 nrdc = rddir_cache_alloc(KM_SLEEP);
3970                 nrdc->nfs3_cookie = uiop->uio_loffset;
3971                 nrdc->buflen = count;
3972                 goto top;
3973         }
3974 
3975         /*
3976          * Add this entry to the cache.
3977          */
3978         avl_insert(&rp->r_dir, nrdc, where);
3979         rddir_cache_hold(nrdc);
3980         mutex_exit(&rp->r_statelock);
3981 
3982 bottom:
3983 #ifdef DEBUG
3984         missed = 1;
3985         nfs3_readdir_cache_misses++;
3986 #endif
3987         /*
3988          * Do the readdir.  This routine decides whether to use
3989          * READDIR or READDIRPLUS.
3990          */
3991         error = do_nfs3readdir(vp, nrdc, cr);
3992 
3993         /*
3994          * If this operation failed, just return the error which occurred.
3995          */
3996         if (error != 0)
3997                 return (error);
3998 
3999         /*
4000          * Since the RPC operation will have taken sometime and blocked
4001          * this process, another pass through the cache will need to be
4002          * taken to find the correct cache entry.  It is possible that
4003          * the correct cache entry will not be there (although one was
4004          * added) because the directory changed during the RPC operation
4005          * and the readdir cache was flushed.  In this case, just start
4006          * over.  It is hoped that this will not happen too often... :-)
4007          */
4008         nrdc = NULL;
4009         goto top;
4010         /* NOTREACHED */
4011 }
4012 
4013 static int
4014 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4015 {
4016         int error;
4017         rnode_t *rp;
4018         mntinfo_t *mi;
4019 
4020         rp = VTOR(vp);
4021         mi = VTOMI(vp);
4022         ASSERT(nfs_zone() == mi->mi_zone);
4023         /*
4024          * Issue the proper request.
4025          *
4026          * If the server does not support READDIRPLUS, then use READDIR.
4027          *
4028          * Otherwise --
4029          * Issue a READDIRPLUS if reading to fill an empty cache or if
4030          * an application has performed a lookup in the directory which
4031          * required an over the wire lookup.  The use of READDIRPLUS
4032          * will help to (re)populate the DNLC.
4033          */
4034         if (!(mi->mi_flags & MI_READDIRONLY) &&
4035             (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
4036                 if (rp->r_flags & RREADDIRPLUS) {
4037                         mutex_enter(&rp->r_statelock);
4038                         rp->r_flags &= ~RREADDIRPLUS;
4039                         mutex_exit(&rp->r_statelock);
4040                 }
4041                 nfs3readdirplus(vp, rdc, cr);
4042                 if (rdc->error == EOPNOTSUPP)
4043                         nfs3readdir(vp, rdc, cr);
4044         } else
4045                 nfs3readdir(vp, rdc, cr);
4046 
4047         mutex_enter(&rp->r_statelock);
4048         rdc->flags &= ~RDDIR;
4049         if (rdc->flags & RDDIRWAIT) {
4050                 rdc->flags &= ~RDDIRWAIT;
4051                 cv_broadcast(&rdc->cv);
4052         }
4053         error = rdc->error;
4054         if (error)
4055                 rdc->flags |= RDDIRREQ;
4056         mutex_exit(&rp->r_statelock);
4057 
4058         rddir_cache_rele(rdc);
4059 
4060         return (error);
4061 }
4062 
4063 static void
4064 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4065 {
4066         int error;
4067         READDIR3args args;
4068         READDIR3vres res;
4069         vattr_t dva;
4070         rnode_t *rp;
4071         int douprintf;
4072         failinfo_t fi, *fip = NULL;
4073         mntinfo_t *mi;
4074         hrtime_t t;
4075 
4076         rp = VTOR(vp);
4077         mi = VTOMI(vp);
4078         ASSERT(nfs_zone() == mi->mi_zone);
4079 
4080         args.dir = *RTOFH3(rp);
4081         args.cookie = (cookie3)rdc->nfs3_cookie;
4082         args.cookieverf = rp->r_cookieverf;
4083         args.count = rdc->buflen;
4084 
4085         /*
4086          * NFS client failover support
4087          * suppress failover unless we have a zero cookie
4088          */
4089         if (args.cookie == (cookie3) 0) {
4090                 fi.vp = vp;
4091                 fi.fhp = (caddr_t)&args.dir;
4092                 fi.copyproc = nfs3copyfh;
4093                 fi.lookupproc = nfs3lookup;
4094                 fi.xattrdirproc = acl_getxattrdir3;
4095                 fip = &fi;
4096         }
4097 
4098 #ifdef DEBUG
4099         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4100 #else
4101         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4102 #endif
4103 
4104         res.entries = (dirent64_t *)rdc->entries;
4105         res.entries_size = rdc->buflen;
4106         res.dir_attributes.fres.vap = &dva;
4107         res.dir_attributes.fres.vp = vp;
4108         res.loff = rdc->nfs3_cookie;
4109 
4110         douprintf = 1;
4111 
4112         if (mi->mi_io_kstats) {
4113                 mutex_enter(&mi->mi_lock);
4114                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4115                 mutex_exit(&mi->mi_lock);
4116         }
4117 
4118         t = gethrtime();
4119 
4120         error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4121             xdr_READDIR3args, (caddr_t)&args,
4122             xdr_READDIR3vres, (caddr_t)&res, cr,
4123             &douprintf, &res.status, 0, fip);
4124 
4125         if (mi->mi_io_kstats) {
4126                 mutex_enter(&mi->mi_lock);
4127                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4128                 mutex_exit(&mi->mi_lock);
4129         }
4130 
4131         if (error)
4132                 goto err;
4133 
4134         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4135 
4136         error = geterrno3(res.status);
4137         if (error) {
4138                 PURGE_STALE_FH(error, vp, cr);
4139                 goto err;
4140         }
4141 
4142         if (mi->mi_io_kstats) {
4143                 mutex_enter(&mi->mi_lock);
4144                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4145                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4146                 mutex_exit(&mi->mi_lock);
4147         }
4148 
4149         rdc->nfs3_ncookie = res.loff;
4150         rp->r_cookieverf = res.cookieverf;
4151         rdc->eof = res.eof ? 1 : 0;
4152         rdc->entlen = res.size;
4153         ASSERT(rdc->entlen <= rdc->buflen);
4154         rdc->error = 0;
4155         return;
4156 
4157 err:
4158         kmem_free(rdc->entries, rdc->buflen);
4159         rdc->entries = NULL;
4160         rdc->error = error;
4161 }
4162 
4163 /*
4164  * Read directory entries.
4165  * There are some weird things to look out for here.  The uio_loffset
4166  * field is either 0 or it is the offset returned from a previous
4167  * readdir.  It is an opaque value used by the server to find the
4168  * correct directory block to read. The count field is the number
4169  * of blocks to read on the server.  This is advisory only, the server
4170  * may return only one block's worth of entries.  Entries may be compressed
4171  * on the server.
4172  */
4173 static void
4174 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4175 {
4176         int error;
4177         READDIRPLUS3args args;
4178         READDIRPLUS3vres res;
4179         vattr_t dva;
4180         rnode_t *rp;
4181         mntinfo_t *mi;
4182         int douprintf;
4183         failinfo_t fi, *fip = NULL;
4184 
4185         rp = VTOR(vp);
4186         mi = VTOMI(vp);
4187         ASSERT(nfs_zone() == mi->mi_zone);
4188 
4189         args.dir = *RTOFH3(rp);
4190         args.cookie = (cookie3)rdc->nfs3_cookie;
4191         args.cookieverf = rp->r_cookieverf;
4192         args.dircount = rdc->buflen;
4193         args.maxcount = mi->mi_tsize;
4194 
4195         /*
4196          * NFS client failover support
4197          * suppress failover unless we have a zero cookie
4198          */
4199         if (args.cookie == (cookie3)0) {
4200                 fi.vp = vp;
4201                 fi.fhp = (caddr_t)&args.dir;
4202                 fi.copyproc = nfs3copyfh;
4203                 fi.lookupproc = nfs3lookup;
4204                 fi.xattrdirproc = acl_getxattrdir3;
4205                 fip = &fi;
4206         }
4207 
4208 #ifdef DEBUG
4209         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4210 #else
4211         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4212 #endif
4213 
4214         res.entries = (dirent64_t *)rdc->entries;
4215         res.entries_size = rdc->buflen;
4216         res.dir_attributes.fres.vap = &dva;
4217         res.dir_attributes.fres.vp = vp;
4218         res.loff = rdc->nfs3_cookie;
4219         res.credentials = cr;
4220 
4221         douprintf = 1;
4222 
4223         if (mi->mi_io_kstats) {
4224                 mutex_enter(&mi->mi_lock);
4225                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4226                 mutex_exit(&mi->mi_lock);
4227         }
4228 
4229         res.time = gethrtime();
4230 
4231         error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4232             xdr_READDIRPLUS3args, (caddr_t)&args,
4233             xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4234             &douprintf, &res.status, 0, fip);
4235 
4236         if (mi->mi_io_kstats) {
4237                 mutex_enter(&mi->mi_lock);
4238                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4239                 mutex_exit(&mi->mi_lock);
4240         }
4241 
4242         if (error) {
4243                 goto err;
4244         }
4245 
4246         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4247 
4248         error = geterrno3(res.status);
4249         if (error) {
4250                 PURGE_STALE_FH(error, vp, cr);
4251                 if (error == EOPNOTSUPP) {
4252                         mutex_enter(&mi->mi_lock);
4253                         mi->mi_flags |= MI_READDIRONLY;
4254                         mutex_exit(&mi->mi_lock);
4255                 }
4256                 goto err;
4257         }
4258 
4259         if (mi->mi_io_kstats) {
4260                 mutex_enter(&mi->mi_lock);
4261                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4262                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4263                 mutex_exit(&mi->mi_lock);
4264         }
4265 
4266         rdc->nfs3_ncookie = res.loff;
4267         rp->r_cookieverf = res.cookieverf;
4268         rdc->eof = res.eof ? 1 : 0;
4269         rdc->entlen = res.size;
4270         ASSERT(rdc->entlen <= rdc->buflen);
4271         rdc->error = 0;
4272 
4273         return;
4274 
4275 err:
4276         kmem_free(rdc->entries, rdc->buflen);
4277         rdc->entries = NULL;
4278         rdc->error = error;
4279 }
4280 
4281 #ifdef DEBUG
4282 static int nfs3_bio_do_stop = 0;
4283 #endif
4284 
4285 static int
4286 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4287 {
4288         rnode_t *rp = VTOR(bp->b_vp);
4289         int count;
4290         int error;
4291         cred_t *cred;
4292         offset_t offset;
4293 
4294         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4295         offset = ldbtob(bp->b_lblkno);
4296 
4297         DTRACE_IO1(start, struct buf *, bp);
4298 
4299         if (bp->b_flags & B_READ) {
4300                 mutex_enter(&rp->r_statelock);
4301                 if (rp->r_cred != NULL) {
4302                         cred = rp->r_cred;
4303                         crhold(cred);
4304                 } else {
4305                         rp->r_cred = cr;
4306                         crhold(cr);
4307                         cred = cr;
4308                         crhold(cred);
4309                 }
4310                 mutex_exit(&rp->r_statelock);
4311         read_again:
4312                 error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4313                     offset, bp->b_bcount, &bp->b_resid, cred);
4314                 crfree(cred);
4315                 if (!error) {
4316                         if (bp->b_resid) {
4317                                 /*
4318                                  * Didn't get it all because we hit EOF,
4319                                  * zero all the memory beyond the EOF.
4320                                  */
4321                                 /* bzero(rdaddr + */
4322                                 bzero(bp->b_un.b_addr +
4323                                     bp->b_bcount - bp->b_resid, bp->b_resid);
4324                         }
4325                         mutex_enter(&rp->r_statelock);
4326                         if (bp->b_resid == bp->b_bcount &&
4327                             offset >= rp->r_size) {
4328                                 /*
4329                                  * We didn't read anything at all as we are
4330                                  * past EOF.  Return an error indicator back
4331                                  * but don't destroy the pages (yet).
4332                                  */
4333                                 error = NFS_EOF;
4334                         }
4335                         mutex_exit(&rp->r_statelock);
4336                 } else if (error == EACCES) {
4337                         mutex_enter(&rp->r_statelock);
4338                         if (cred != cr) {
4339                                 if (rp->r_cred != NULL)
4340                                         crfree(rp->r_cred);
4341                                 rp->r_cred = cr;
4342                                 crhold(cr);
4343                                 cred = cr;
4344                                 crhold(cred);
4345                                 mutex_exit(&rp->r_statelock);
4346                                 goto read_again;
4347                         }
4348                         mutex_exit(&rp->r_statelock);
4349                 }
4350         } else {
4351                 if (!(rp->r_flags & RSTALE)) {
4352                         mutex_enter(&rp->r_statelock);
4353                         if (rp->r_cred != NULL) {
4354                                 cred = rp->r_cred;
4355                                 crhold(cred);
4356                         } else {
4357                                 rp->r_cred = cr;
4358                                 crhold(cr);
4359                                 cred = cr;
4360                                 crhold(cred);
4361                         }
4362                         mutex_exit(&rp->r_statelock);
4363                 write_again:
4364                         mutex_enter(&rp->r_statelock);
4365                         count = MIN(bp->b_bcount, rp->r_size - offset);
4366                         mutex_exit(&rp->r_statelock);
4367                         if (count < 0)
4368                                 cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4369 #ifdef DEBUG
4370                         if (count == 0) {
4371                                 zcmn_err(getzoneid(), CE_WARN,
4372                                     "nfs3_bio: zero length write at %lld",
4373                                     offset);
4374                                 nfs_printfhandle(&rp->r_fh);
4375                                 if (nfs3_bio_do_stop)
4376                                         debug_enter("nfs3_bio");
4377                         }
4378 #endif
4379                         error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4380                             count, cred, stab_comm);
4381                         if (error == EACCES) {
4382                                 mutex_enter(&rp->r_statelock);
4383                                 if (cred != cr) {
4384                                         if (rp->r_cred != NULL)
4385                                                 crfree(rp->r_cred);
4386                                         rp->r_cred = cr;
4387                                         crhold(cr);
4388                                         crfree(cred);
4389                                         cred = cr;
4390                                         crhold(cred);
4391                                         mutex_exit(&rp->r_statelock);
4392                                         goto write_again;
4393                                 }
4394                                 mutex_exit(&rp->r_statelock);
4395                         }
4396                         bp->b_error = error;
4397                         if (error && error != EINTR) {
4398                                 /*
4399                                  * Don't print EDQUOT errors on the console.
4400                                  * Don't print asynchronous EACCES errors.
4401                                  * Don't print EFBIG errors.
4402                                  * Print all other write errors.
4403                                  */
4404                                 if (error != EDQUOT && error != EFBIG &&
4405                                     (error != EACCES ||
4406                                     !(bp->b_flags & B_ASYNC)))
4407                                         nfs_write_error(bp->b_vp, error, cred);
4408                                 /*
4409                                  * Update r_error and r_flags as appropriate.
4410                                  * If the error was ESTALE, then mark the
4411                                  * rnode as not being writeable and save
4412                                  * the error status.  Otherwise, save any
4413                                  * errors which occur from asynchronous
4414                                  * page invalidations.  Any errors occurring
4415                                  * from other operations should be saved
4416                                  * by the caller.
4417                                  */
4418                                 mutex_enter(&rp->r_statelock);
4419                                 if (error == ESTALE) {
4420                                         rp->r_flags |= RSTALE;
4421                                         if (!rp->r_error)
4422                                                 rp->r_error = error;
4423                                 } else if (!rp->r_error &&
4424                                     (bp->b_flags &
4425                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
4426                                     (B_INVAL|B_FORCE|B_ASYNC)) {
4427                                         rp->r_error = error;
4428                                 }
4429                                 mutex_exit(&rp->r_statelock);
4430                         }
4431                         crfree(cred);
4432                 } else {
4433                         error = rp->r_error;
4434                         /*
4435                          * A close may have cleared r_error, if so,
4436                          * propagate ESTALE error return properly
4437                          */
4438                         if (error == 0)
4439                                 error = ESTALE;
4440                 }
4441         }
4442 
4443         if (error != 0 && error != NFS_EOF)
4444                 bp->b_flags |= B_ERROR;
4445 
4446         DTRACE_IO1(done, struct buf *, bp);
4447 
4448         return (error);
4449 }
4450 
4451 /* ARGSUSED */
4452 static int
4453 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4454 {
4455         rnode_t *rp;
4456 
4457         if (nfs_zone() != VTOMI(vp)->mi_zone)
4458                 return (EIO);
4459         rp = VTOR(vp);
4460 
4461         if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
4462                 fidp->fid_len = rp->r_fh.fh_len;
4463                 return (ENOSPC);
4464         }
4465         fidp->fid_len = rp->r_fh.fh_len;
4466         bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4467         return (0);
4468 }
4469 
4470 /* ARGSUSED2 */
4471 static int
4472 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4473 {
4474         rnode_t *rp = VTOR(vp);
4475 
4476         if (!write_lock) {
4477                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4478                 return (V_WRITELOCK_FALSE);
4479         }
4480 
4481         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4482                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4483                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4484                         return (V_WRITELOCK_FALSE);
4485                 nfs_rw_exit(&rp->r_rwlock);
4486         }
4487 
4488         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4489         return (V_WRITELOCK_TRUE);
4490 }
4491 
4492 /* ARGSUSED */
4493 static void
4494 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4495 {
4496         rnode_t *rp = VTOR(vp);
4497 
4498         nfs_rw_exit(&rp->r_rwlock);
4499 }
4500 
4501 /* ARGSUSED */
4502 static int
4503 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4504 {
4505 
4506         /*
4507          * Because we stuff the readdir cookie into the offset field
4508          * someone may attempt to do an lseek with the cookie which
4509          * we want to succeed.
4510          */
4511         if (vp->v_type == VDIR)
4512                 return (0);
4513         if (*noffp < 0)
4514                 return (EINVAL);
4515         return (0);
4516 }
4517 
4518 /*
4519  * number of nfs3_bsize blocks to read ahead.
4520  */
4521 static int nfs3_nra = 4;
4522 
4523 #ifdef DEBUG
4524 static int nfs3_lostpage = 0;   /* number of times we lost original page */
4525 #endif
4526 
4527 /*
4528  * Return all the pages from [off..off+len) in file
4529  */
4530 /* ARGSUSED */
4531 static int
4532 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4533         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4534         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4535 {
4536         rnode_t *rp;
4537         int error;
4538         mntinfo_t *mi;
4539 
4540         if (vp->v_flag & VNOMAP)
4541                 return (ENOSYS);
4542 
4543         if (nfs_zone() != VTOMI(vp)->mi_zone)
4544                 return (EIO);
4545         if (protp != NULL)
4546                 *protp = PROT_ALL;
4547 
4548         /*
4549          * Now valididate that the caches are up to date.
4550          */
4551         error = nfs3_validate_caches(vp, cr);
4552         if (error)
4553                 return (error);
4554 
4555         rp = VTOR(vp);
4556         mi = VTOMI(vp);
4557 retry:
4558         mutex_enter(&rp->r_statelock);
4559 
4560         /*
4561          * Don't create dirty pages faster than they
4562          * can be cleaned so that the system doesn't
4563          * get imbalanced.  If the async queue is
4564          * maxed out, then wait for it to drain before
4565          * creating more dirty pages.  Also, wait for
4566          * any threads doing pagewalks in the vop_getattr
4567          * entry points so that they don't block for
4568          * long periods.
4569          */
4570         if (rw == S_CREATE) {
4571                 while ((mi->mi_max_threads != 0 &&
4572                     rp->r_awcount > 2 * mi->mi_max_threads) ||
4573                     rp->r_gcount > 0)
4574                         cv_wait(&rp->r_cv, &rp->r_statelock);
4575         }
4576 
4577         /*
4578          * If we are getting called as a side effect of an nfs_write()
4579          * operation the local file size might not be extended yet.
4580          * In this case we want to be able to return pages of zeroes.
4581          */
4582         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4583                 mutex_exit(&rp->r_statelock);
4584                 return (EFAULT);                /* beyond EOF */
4585         }
4586 
4587         mutex_exit(&rp->r_statelock);
4588 
4589         error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4590             pl, plsz, seg, addr, rw, cr);
4591 
4592         switch (error) {
4593         case NFS_EOF:
4594                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4595                 goto retry;
4596         case ESTALE:
4597                 PURGE_STALE_FH(error, vp, cr);
4598         }
4599 
4600         return (error);
4601 }
4602 
4603 /*
4604  * Called from pvn_getpages to get a particular page.
4605  */
4606 /* ARGSUSED */
4607 static int
4608 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
4609         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4610         enum seg_rw rw, cred_t *cr)
4611 {
4612         rnode_t *rp;
4613         uint_t bsize;
4614         struct buf *bp;
4615         page_t *pp;
4616         u_offset_t lbn;
4617         u_offset_t io_off;
4618         u_offset_t blkoff;
4619         u_offset_t rablkoff;
4620         size_t io_len;
4621         uint_t blksize;
4622         int error;
4623         int readahead;
4624         int readahead_issued = 0;
4625         int ra_window; /* readahead window */
4626         page_t *pagefound;
4627         page_t *savepp;
4628 
4629         if (nfs_zone() != VTOMI(vp)->mi_zone)
4630                 return (EIO);
4631         rp = VTOR(vp);
4632         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4633 
4634 reread:
4635         bp = NULL;
4636         pp = NULL;
4637         pagefound = NULL;
4638 
4639         if (pl != NULL)
4640                 pl[0] = NULL;
4641 
4642         error = 0;
4643         lbn = off / bsize;
4644         blkoff = lbn * bsize;
4645 
4646         /*
4647          * Queueing up the readahead before doing the synchronous read
4648          * results in a significant increase in read throughput because
4649          * of the increased parallelism between the async threads and
4650          * the process context.
4651          */
4652         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4653             rw != S_CREATE &&
4654             !(vp->v_flag & VNOCACHE)) {
4655                 mutex_enter(&rp->r_statelock);
4656 
4657                 /*
4658                  * Calculate the number of readaheads to do.
4659                  * a) No readaheads at offset = 0.
4660                  * b) Do maximum(nfs3_nra) readaheads when the readahead
4661                  *    window is closed.
4662                  * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4663                  *    upon how far the readahead window is open or close.
4664                  * d) No readaheads if rp->r_nextr is not within the scope
4665                  *    of the readahead window (random i/o).
4666                  */
4667 
4668                 if (off == 0)
4669                         readahead = 0;
4670                 else if (blkoff == rp->r_nextr)
4671                         readahead = nfs3_nra;
4672                 else if (rp->r_nextr > blkoff &&
4673                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
4674                     <= (nfs3_nra - 1)))
4675                         readahead = nfs3_nra - ra_window;
4676                 else
4677                         readahead = 0;
4678 
4679                 rablkoff = rp->r_nextr;
4680                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4681                         mutex_exit(&rp->r_statelock);
4682                         if (nfs_async_readahead(vp, rablkoff + bsize,
4683                             addr + (rablkoff + bsize - off), seg, cr,
4684                             nfs3_readahead) < 0) {
4685                                 mutex_enter(&rp->r_statelock);
4686                                 break;
4687                         }
4688                         readahead--;
4689                         rablkoff += bsize;
4690                         /*
4691                          * Indicate that we did a readahead so
4692                          * readahead offset is not updated
4693                          * by the synchronous read below.
4694                          */
4695                         readahead_issued = 1;
4696                         mutex_enter(&rp->r_statelock);
4697                         /*
4698                          * set readahead offset to
4699                          * offset of last async readahead
4700                          * request.
4701                          */
4702                         rp->r_nextr = rablkoff;
4703                 }
4704                 mutex_exit(&rp->r_statelock);
4705         }
4706 
4707 again:
4708         if ((pagefound = page_exists(vp, off)) == NULL) {
4709                 if (pl == NULL) {
4710                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4711                             nfs3_readahead);
4712                 } else if (rw == S_CREATE) {
4713                         /*
4714                          * Block for this page is not allocated, or the offset
4715                          * is beyond the current allocation size, or we're
4716                          * allocating a swap slot and the page was not found,
4717                          * so allocate it and return a zero page.
4718                          */
4719                         if ((pp = page_create_va(vp, off,
4720                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4721                                 cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4722                         io_len = PAGESIZE;
4723                         mutex_enter(&rp->r_statelock);
4724                         rp->r_nextr = off + PAGESIZE;
4725                         mutex_exit(&rp->r_statelock);
4726                 } else {
4727                         /*
4728                          * Need to go to server to get a BLOCK, exception to
4729                          * that being while reading at offset = 0 or doing
4730                          * random i/o, in that case read only a PAGE.
4731                          */
4732                         mutex_enter(&rp->r_statelock);
4733                         if (blkoff < rp->r_size &&
4734                             blkoff + bsize >= rp->r_size) {
4735                                 /*
4736                                  * If only a block or less is left in
4737                                  * the file, read all that is remaining.
4738                                  */
4739                                 if (rp->r_size <= off) {
4740                                         /*
4741                                          * Trying to access beyond EOF,
4742                                          * set up to get at least one page.
4743                                          */
4744                                         blksize = off + PAGESIZE - blkoff;
4745                                 } else
4746                                         blksize = rp->r_size - blkoff;
4747                         } else if ((off == 0) ||
4748                             (off != rp->r_nextr && !readahead_issued)) {
4749                                 blksize = PAGESIZE;
4750                                 blkoff = off; /* block = page here */
4751                         } else
4752                                 blksize = bsize;
4753                         mutex_exit(&rp->r_statelock);
4754 
4755                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4756                             &io_len, blkoff, blksize, 0);
4757 
4758                         /*
4759                          * Some other thread has entered the page,
4760                          * so just use it.
4761                          */
4762                         if (pp == NULL)
4763                                 goto again;
4764 
4765                         /*
4766                          * Now round the request size up to page boundaries.
4767                          * This ensures that the entire page will be
4768                          * initialized to zeroes if EOF is encountered.
4769                          */
4770                         io_len = ptob(btopr(io_len));
4771 
4772                         bp = pageio_setup(pp, io_len, vp, B_READ);
4773                         ASSERT(bp != NULL);
4774 
4775                         /*
4776                          * pageio_setup should have set b_addr to 0.  This
4777                          * is correct since we want to do I/O on a page
4778                          * boundary.  bp_mapin will use this addr to calculate
4779                          * an offset, and then set b_addr to the kernel virtual
4780                          * address it allocated for us.
4781                          */
4782                         ASSERT(bp->b_un.b_addr == 0);
4783 
4784                         bp->b_edev = 0;
4785                         bp->b_dev = 0;
4786                         bp->b_lblkno = lbtodb(io_off);
4787                         bp->b_file = vp;
4788                         bp->b_offset = (offset_t)off;
4789                         bp_mapin(bp);
4790 
4791                         /*
4792                          * If doing a write beyond what we believe is EOF,
4793                          * don't bother trying to read the pages from the
4794                          * server, we'll just zero the pages here.  We
4795                          * don't check that the rw flag is S_WRITE here
4796                          * because some implementations may attempt a
4797                          * read access to the buffer before copying data.
4798                          */
4799                         mutex_enter(&rp->r_statelock);
4800                         if (io_off >= rp->r_size && seg == segkmap) {
4801                                 mutex_exit(&rp->r_statelock);
4802                                 bzero(bp->b_un.b_addr, io_len);
4803                         } else {
4804                                 mutex_exit(&rp->r_statelock);
4805                                 error = nfs3_bio(bp, NULL, cr);
4806                         }
4807 
4808                         /*
4809                          * Unmap the buffer before freeing it.
4810                          */
4811                         bp_mapout(bp);
4812                         pageio_done(bp);
4813 
4814                         savepp = pp;
4815                         do {
4816                                 pp->p_fsdata = C_NOCOMMIT;
4817                         } while ((pp = pp->p_next) != savepp);
4818 
4819                         if (error == NFS_EOF) {
4820                                 /*
4821                                  * If doing a write system call just return
4822                                  * zeroed pages, else user tried to get pages
4823                                  * beyond EOF, return error.  We don't check
4824                                  * that the rw flag is S_WRITE here because
4825                                  * some implementations may attempt a read
4826                                  * access to the buffer before copying data.
4827                                  */
4828                                 if (seg == segkmap)
4829                                         error = 0;
4830                                 else
4831                                         error = EFAULT;
4832                         }
4833 
4834                         if (!readahead_issued && !error) {
4835                                 mutex_enter(&rp->r_statelock);
4836                                 rp->r_nextr = io_off + io_len;
4837                                 mutex_exit(&rp->r_statelock);
4838                         }
4839                 }
4840         }
4841 
4842 out:
4843         if (pl == NULL)
4844                 return (error);
4845 
4846         if (error) {
4847                 if (pp != NULL)
4848                         pvn_read_done(pp, B_ERROR);
4849                 return (error);
4850         }
4851 
4852         if (pagefound) {
4853                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4854 
4855                 /*
4856                  * Page exists in the cache, acquire the appropriate lock.
4857                  * If this fails, start all over again.
4858                  */
4859                 if ((pp = page_lookup(vp, off, se)) == NULL) {
4860 #ifdef DEBUG
4861                         nfs3_lostpage++;
4862 #endif
4863                         goto reread;
4864                 }
4865                 pl[0] = pp;
4866                 pl[1] = NULL;
4867                 return (0);
4868         }
4869 
4870         if (pp != NULL)
4871                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4872 
4873         return (error);
4874 }
4875 
4876 static void
4877 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4878         cred_t *cr)
4879 {
4880         int error;
4881         page_t *pp;
4882         u_offset_t io_off;
4883         size_t io_len;
4884         struct buf *bp;
4885         uint_t bsize, blksize;
4886         rnode_t *rp = VTOR(vp);
4887         page_t *savepp;
4888 
4889         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4890         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4891 
4892         mutex_enter(&rp->r_statelock);
4893         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4894                 /*
4895                  * If less than a block left in file read less
4896                  * than a block.
4897                  */
4898                 blksize = rp->r_size - blkoff;
4899         } else
4900                 blksize = bsize;
4901         mutex_exit(&rp->r_statelock);
4902 
4903         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4904             &io_off, &io_len, blkoff, blksize, 1);
4905         /*
4906          * The isra flag passed to the kluster function is 1, we may have
4907          * gotten a return value of NULL for a variety of reasons (# of free
4908          * pages < minfree, someone entered the page on the vnode etc). In all
4909          * cases, we want to punt on the readahead.
4910          */
4911         if (pp == NULL)
4912                 return;
4913 
4914         /*
4915          * Now round the request size up to page boundaries.
4916          * This ensures that the entire page will be
4917          * initialized to zeroes if EOF is encountered.
4918          */
4919         io_len = ptob(btopr(io_len));
4920 
4921         bp = pageio_setup(pp, io_len, vp, B_READ);
4922         ASSERT(bp != NULL);
4923 
4924         /*
4925          * pageio_setup should have set b_addr to 0.  This is correct since
4926          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4927          * to calculate an offset, and then set b_addr to the kernel virtual
4928          * address it allocated for us.
4929          */
4930         ASSERT(bp->b_un.b_addr == 0);
4931 
4932         bp->b_edev = 0;
4933         bp->b_dev = 0;
4934         bp->b_lblkno = lbtodb(io_off);
4935         bp->b_file = vp;
4936         bp->b_offset = (offset_t)blkoff;
4937         bp_mapin(bp);
4938 
4939         /*
4940          * If doing a write beyond what we believe is EOF, don't bother trying
4941          * to read the pages from the server, we'll just zero the pages here.
4942          * We don't check that the rw flag is S_WRITE here because some
4943          * implementations may attempt a read access to the buffer before
4944          * copying data.
4945          */
4946         mutex_enter(&rp->r_statelock);
4947         if (io_off >= rp->r_size && seg == segkmap) {
4948                 mutex_exit(&rp->r_statelock);
4949                 bzero(bp->b_un.b_addr, io_len);
4950                 error = 0;
4951         } else {
4952                 mutex_exit(&rp->r_statelock);
4953                 error = nfs3_bio(bp, NULL, cr);
4954                 if (error == NFS_EOF)
4955                         error = 0;
4956         }
4957 
4958         /*
4959          * Unmap the buffer before freeing it.
4960          */
4961         bp_mapout(bp);
4962         pageio_done(bp);
4963 
4964         savepp = pp;
4965         do {
4966                 pp->p_fsdata = C_NOCOMMIT;
4967         } while ((pp = pp->p_next) != savepp);
4968 
4969         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4970 
4971         /*
4972          * In case of error set readahead offset
4973          * to the lowest offset.
4974          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4975          */
4976         if (error && rp->r_nextr > io_off) {
4977                 mutex_enter(&rp->r_statelock);
4978                 if (rp->r_nextr > io_off)
4979                         rp->r_nextr = io_off;
4980                 mutex_exit(&rp->r_statelock);
4981         }
4982 }
4983 
4984 /*
4985  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4986  * If len == 0, do from off to EOF.
4987  *
4988  * The normal cases should be len == 0 && off == 0 (entire vp list),
4989  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4990  * (from pageout).
4991  */
4992 /* ARGSUSED */
4993 static int
4994 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4995         caller_context_t *ct)
4996 {
4997         int error;
4998         rnode_t *rp;
4999 
5000         ASSERT(cr != NULL);
5001 
5002         /*
5003          * XXX - Why should this check be made here?
5004          */
5005         if (vp->v_flag & VNOMAP)
5006                 return (ENOSYS);
5007         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
5008                 return (0);
5009         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5010                 return (EIO);
5011 
5012         rp = VTOR(vp);
5013         mutex_enter(&rp->r_statelock);
5014         rp->r_count++;
5015         mutex_exit(&rp->r_statelock);
5016         error = nfs_putpages(vp, off, len, flags, cr);
5017         mutex_enter(&rp->r_statelock);
5018         rp->r_count--;
5019         cv_broadcast(&rp->r_cv);
5020         mutex_exit(&rp->r_statelock);
5021 
5022         return (error);
5023 }
5024 
5025 /*
5026  * Write out a single page, possibly klustering adjacent dirty pages.
5027  */
5028 int
5029 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5030         int flags, cred_t *cr)
5031 {
5032         u_offset_t io_off;
5033         u_offset_t lbn_off;
5034         u_offset_t lbn;
5035         size_t io_len;
5036         uint_t bsize;
5037         int error;
5038         rnode_t *rp;
5039 
5040         ASSERT(!vn_is_readonly(vp));
5041         ASSERT(pp != NULL);
5042         ASSERT(cr != NULL);
5043         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5044 
5045         rp = VTOR(vp);
5046         ASSERT(rp->r_count > 0);
5047 
5048         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5049         lbn = pp->p_offset / bsize;
5050         lbn_off = lbn * bsize;
5051 
5052         /*
5053          * Find a kluster that fits in one block, or in
5054          * one page if pages are bigger than blocks.  If
5055          * there is less file space allocated than a whole
5056          * page, we'll shorten the i/o request below.
5057          */
5058         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5059             roundup(bsize, PAGESIZE), flags);
5060 
5061         /*
5062          * pvn_write_kluster shouldn't have returned a page with offset
5063          * behind the original page we were given.  Verify that.
5064          */
5065         ASSERT((pp->p_offset / bsize) >= lbn);
5066 
5067         /*
5068          * Now pp will have the list of kept dirty pages marked for
5069          * write back.  It will also handle invalidation and freeing
5070          * of pages that are not dirty.  Check for page length rounding
5071          * problems.
5072          */
5073         if (io_off + io_len > lbn_off + bsize) {
5074                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5075                 io_len = lbn_off + bsize - io_off;
5076         }
5077         /*
5078          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5079          * consistent value of r_size. RMODINPROGRESS is set in writerp().
5080          * When RMODINPROGRESS is set it indicates that a uiomove() is in
5081          * progress and the r_size has not been made consistent with the
5082          * new size of the file. When the uiomove() completes the r_size is
5083          * updated and the RMODINPROGRESS flag is cleared.
5084          *
5085          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5086          * consistent value of r_size. Without this handshaking, it is
5087          * possible that nfs(3)_bio() picks  up the old value of r_size
5088          * before the uiomove() in writerp() completes. This will result
5089          * in the write through nfs(3)_bio() being dropped.
5090          *
5091          * More precisely, there is a window between the time the uiomove()
5092          * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5093          * operation intervenes in this window, the page will be picked up,
5094          * because it is dirty (it will be unlocked, unless it was
5095          * pagecreate'd). When the page is picked up as dirty, the dirty
5096          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5097          * checked. This will still be the old size. Therefore the page will
5098          * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5099          * the page will be found to be clean and the write will be dropped.
5100          */
5101         if (rp->r_flags & RMODINPROGRESS) {
5102                 mutex_enter(&rp->r_statelock);
5103                 if ((rp->r_flags & RMODINPROGRESS) &&
5104                     rp->r_modaddr + MAXBSIZE > io_off &&
5105                     rp->r_modaddr < io_off + io_len) {
5106                         page_t *plist;
5107                         /*
5108                          * A write is in progress for this region of the file.
5109                          * If we did not detect RMODINPROGRESS here then this
5110                          * path through nfs_putapage() would eventually go to
5111                          * nfs(3)_bio() and may not write out all of the data
5112                          * in the pages. We end up losing data. So we decide
5113                          * to set the modified bit on each page in the page
5114                          * list and mark the rnode with RDIRTY. This write
5115                          * will be restarted at some later time.
5116                          */
5117                         plist = pp;
5118                         while (plist != NULL) {
5119                                 pp = plist;
5120                                 page_sub(&plist, pp);
5121                                 hat_setmod(pp);
5122                                 page_io_unlock(pp);
5123                                 page_unlock(pp);
5124                         }
5125                         rp->r_flags |= RDIRTY;
5126                         mutex_exit(&rp->r_statelock);
5127                         if (offp)
5128                                 *offp = io_off;
5129                         if (lenp)
5130                                 *lenp = io_len;
5131                         return (0);
5132                 }
5133                 mutex_exit(&rp->r_statelock);
5134         }
5135 
5136         if (flags & B_ASYNC) {
5137                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5138                     nfs3_sync_putapage);
5139         } else
5140                 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5141 
5142         if (offp)
5143                 *offp = io_off;
5144         if (lenp)
5145                 *lenp = io_len;
5146         return (error);
5147 }
5148 
5149 static int
5150 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5151         int flags, cred_t *cr)
5152 {
5153         int error;
5154         rnode_t *rp;
5155 
5156         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5157 
5158         flags |= B_WRITE;
5159 
5160         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5161 
5162         rp = VTOR(vp);
5163 
5164         if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5165             error == EACCES) &&
5166             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5167                 if (!(rp->r_flags & ROUTOFSPACE)) {
5168                         mutex_enter(&rp->r_statelock);
5169                         rp->r_flags |= ROUTOFSPACE;
5170                         mutex_exit(&rp->r_statelock);
5171                 }
5172                 flags |= B_ERROR;
5173                 pvn_write_done(pp, flags);
5174                 /*
5175                  * If this was not an async thread, then try again to
5176                  * write out the pages, but this time, also destroy
5177                  * them whether or not the write is successful.  This
5178                  * will prevent memory from filling up with these
5179                  * pages and destroying them is the only alternative
5180                  * if they can't be written out.
5181                  *
5182                  * Don't do this if this is an async thread because
5183                  * when the pages are unlocked in pvn_write_done,
5184                  * some other thread could have come along, locked
5185                  * them, and queued for an async thread.  It would be
5186                  * possible for all of the async threads to be tied
5187                  * up waiting to lock the pages again and they would
5188                  * all already be locked and waiting for an async
5189                  * thread to handle them.  Deadlock.
5190                  */
5191                 if (!(flags & B_ASYNC)) {
5192                         error = nfs3_putpage(vp, io_off, io_len,
5193                             B_INVAL | B_FORCE, cr, NULL);
5194                 }
5195         } else {
5196                 if (error)
5197                         flags |= B_ERROR;
5198                 else if (rp->r_flags & ROUTOFSPACE) {
5199                         mutex_enter(&rp->r_statelock);
5200                         rp->r_flags &= ~ROUTOFSPACE;
5201                         mutex_exit(&rp->r_statelock);
5202                 }
5203                 pvn_write_done(pp, flags);
5204                 if (freemem < desfree)
5205                         (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5206         }
5207 
5208         return (error);
5209 }
5210 
5211 /* ARGSUSED */
5212 static int
5213 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5214         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5215         cred_t *cr, caller_context_t *ct)
5216 {
5217         struct segvn_crargs vn_a;
5218         int error;
5219         rnode_t *rp;
5220         struct vattr va;
5221 
5222         if (nfs_zone() != VTOMI(vp)->mi_zone)
5223                 return (EIO);
5224 
5225         if (vp->v_flag & VNOMAP)
5226                 return (ENOSYS);
5227 
5228         if (off < 0 || off + len < 0)
5229                 return (ENXIO);
5230 
5231         if (vp->v_type != VREG)
5232                 return (ENODEV);
5233 
5234         /*
5235          * If there is cached data and if close-to-open consistency
5236          * checking is not turned off and if the file system is not
5237          * mounted readonly, then force an over the wire getattr.
5238          * Otherwise, just invoke nfs3getattr to get a copy of the
5239          * attributes.  The attribute cache will be used unless it
5240          * is timed out and if it is, then an over the wire getattr
5241          * will be issued.
5242          */
5243         va.va_mask = AT_ALL;
5244         if (vn_has_cached_data(vp) &&
5245             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5246                 error = nfs3_getattr_otw(vp, &va, cr);
5247         else
5248                 error = nfs3getattr(vp, &va, cr);
5249         if (error)
5250                 return (error);
5251 
5252         /*
5253          * Check to see if the vnode is currently marked as not cachable.
5254          * This means portions of the file are locked (through VOP_FRLOCK).
5255          * In this case the map request must be refused.  We use
5256          * rp->r_lkserlock to avoid a race with concurrent lock requests.
5257          */
5258         rp = VTOR(vp);
5259 
5260         /*
5261          * Atomically increment r_inmap after acquiring r_rwlock. The
5262          * idea here is to acquire r_rwlock to block read/write and
5263          * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5264          * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5265          * and we can prevent the deadlock that would have occurred
5266          * when nfs3_addmap() would have acquired it out of order.
5267          *
5268          * Since we are not protecting r_inmap by any lock, we do not
5269          * hold any lock when we decrement it. We atomically decrement
5270          * r_inmap after we release r_lkserlock.
5271          */
5272 
5273         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5274                 return (EINTR);
5275         atomic_inc_uint(&rp->r_inmap);
5276         nfs_rw_exit(&rp->r_rwlock);
5277 
5278         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5279                 atomic_dec_uint(&rp->r_inmap);
5280                 return (EINTR);
5281         }
5282 
5283         if (vp->v_flag & VNOCACHE) {
5284                 error = EAGAIN;
5285                 goto done;
5286         }
5287 
5288         /*
5289          * Don't allow concurrent locks and mapping if mandatory locking is
5290          * enabled.
5291          */
5292         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5293             MANDLOCK(vp, va.va_mode)) {
5294                 error = EAGAIN;
5295                 goto done;
5296         }
5297 
5298         as_rangelock(as);
5299         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5300         if (error != 0) {
5301                 as_rangeunlock(as);
5302                 goto done;
5303         }
5304 
5305         vn_a.vp = vp;
5306         vn_a.offset = off;
5307         vn_a.type = (flags & MAP_TYPE);
5308         vn_a.prot = (uchar_t)prot;
5309         vn_a.maxprot = (uchar_t)maxprot;
5310         vn_a.flags = (flags & ~MAP_TYPE);
5311         vn_a.cred = cr;
5312         vn_a.amp = NULL;
5313         vn_a.szc = 0;
5314         vn_a.lgrp_mem_policy_flags = 0;
5315 
5316         error = as_map(as, *addrp, len, segvn_create, &vn_a);
5317         as_rangeunlock(as);
5318 
5319 done:
5320         nfs_rw_exit(&rp->r_lkserlock);
5321         atomic_dec_uint(&rp->r_inmap);
5322         return (error);
5323 }
5324 
5325 /* ARGSUSED */
5326 static int
5327 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5328         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5329         cred_t *cr, caller_context_t *ct)
5330 {
5331         rnode_t *rp;
5332 
5333         if (vp->v_flag & VNOMAP)
5334                 return (ENOSYS);
5335         if (nfs_zone() != VTOMI(vp)->mi_zone)
5336                 return (EIO);
5337 
5338         rp = VTOR(vp);
5339         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5340 
5341         return (0);
5342 }
5343 
5344 /* ARGSUSED */
5345 static int
5346 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5347         offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5348         caller_context_t *ct)
5349 {
5350         netobj lm_fh3;
5351         int rc;
5352         u_offset_t start, end;
5353         rnode_t *rp;
5354         int error = 0, intr = INTR(vp);
5355 
5356         if (nfs_zone() != VTOMI(vp)->mi_zone)
5357                 return (EIO);
5358         /* check for valid cmd parameter */
5359         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5360                 return (EINVAL);
5361 
5362         /* Verify l_type. */
5363         switch (bfp->l_type) {
5364         case F_RDLCK:
5365                 if (cmd != F_GETLK && !(flag & FREAD))
5366                         return (EBADF);
5367                 break;
5368         case F_WRLCK:
5369                 if (cmd != F_GETLK && !(flag & FWRITE))
5370                         return (EBADF);
5371                 break;
5372         case F_UNLCK:
5373                 intr = 0;
5374                 break;
5375 
5376         default:
5377                 return (EINVAL);
5378         }
5379 
5380         /* check the validity of the lock range */
5381         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5382                 return (rc);
5383         if (rc = flk_check_lock_data(start, end, MAXEND))
5384                 return (rc);
5385 
5386         /*
5387          * If the filesystem is mounted using local locking, pass the
5388          * request off to the local locking code.
5389          */
5390         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5391                 if (cmd == F_SETLK || cmd == F_SETLKW) {
5392                         /*
5393                          * For complete safety, we should be holding
5394                          * r_lkserlock.  However, we can't call
5395                          * lm_safelock and then fs_frlock while
5396                          * holding r_lkserlock, so just invoke
5397                          * lm_safelock and expect that this will
5398                          * catch enough of the cases.
5399                          */
5400                         if (!lm_safelock(vp, bfp, cr))
5401                                 return (EAGAIN);
5402                 }
5403                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5404         }
5405 
5406         rp = VTOR(vp);
5407 
5408         /*
5409          * Check whether the given lock request can proceed, given the
5410          * current file mappings.
5411          */
5412         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5413                 return (EINTR);
5414         if (cmd == F_SETLK || cmd == F_SETLKW) {
5415                 if (!lm_safelock(vp, bfp, cr)) {
5416                         rc = EAGAIN;
5417                         goto done;
5418                 }
5419         }
5420 
5421         /*
5422          * Flush the cache after waiting for async I/O to finish.  For new
5423          * locks, this is so that the process gets the latest bits from the
5424          * server.  For unlocks, this is so that other clients see the
5425          * latest bits once the file has been unlocked.  If currently dirty
5426          * pages can't be flushed, then don't allow a lock to be set.  But
5427          * allow unlocks to succeed, to avoid having orphan locks on the
5428          * server.
5429          */
5430         if (cmd != F_GETLK) {
5431                 mutex_enter(&rp->r_statelock);
5432                 while (rp->r_count > 0) {
5433                         if (intr) {
5434                                 klwp_t *lwp = ttolwp(curthread);
5435 
5436                                 if (lwp != NULL)
5437                                         lwp->lwp_nostop++;
5438                                 if (cv_wait_sig(&rp->r_cv,
5439                                     &rp->r_statelock) == 0) {
5440                                         if (lwp != NULL)
5441                                                 lwp->lwp_nostop--;
5442                                         rc = EINTR;
5443                                         break;
5444                                 }
5445                                 if (lwp != NULL)
5446                                         lwp->lwp_nostop--;
5447                         } else
5448                                 cv_wait(&rp->r_cv, &rp->r_statelock);
5449                 }
5450                 mutex_exit(&rp->r_statelock);
5451                 if (rc != 0)
5452                         goto done;
5453                 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5454                 if (error) {
5455                         if (error == ENOSPC || error == EDQUOT) {
5456                                 mutex_enter(&rp->r_statelock);
5457                                 if (!rp->r_error)
5458                                         rp->r_error = error;
5459                                 mutex_exit(&rp->r_statelock);
5460                         }
5461                         if (bfp->l_type != F_UNLCK) {
5462                                 rc = ENOLCK;
5463                                 goto done;
5464                         }
5465                 }
5466         }
5467 
5468         lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5469         lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5470 
5471         /*
5472          * Call the lock manager to do the real work of contacting
5473          * the server and obtaining the lock.
5474          */
5475         rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5476 
5477         if (rc == 0)
5478                 nfs_lockcompletion(vp, cmd);
5479 
5480 done:
5481         nfs_rw_exit(&rp->r_lkserlock);
5482         return (rc);
5483 }
5484 
5485 /*
5486  * Free storage space associated with the specified vnode.  The portion
5487  * to be freed is specified by bfp->l_start and bfp->l_len (already
5488  * normalized to a "whence" of 0).
5489  *
5490  * This is an experimental facility whose continued existence is not
5491  * guaranteed.  Currently, we only support the special case
5492  * of l_len == 0, meaning free to end of file.
5493  */
5494 /* ARGSUSED */
5495 static int
5496 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5497         offset_t offset, cred_t *cr, caller_context_t *ct)
5498 {
5499         int error;
5500 
5501         ASSERT(vp->v_type == VREG);
5502         if (cmd != F_FREESP)
5503                 return (EINVAL);
5504         if (nfs_zone() != VTOMI(vp)->mi_zone)
5505                 return (EIO);
5506 
5507         error = convoff(vp, bfp, 0, offset);
5508         if (!error) {
5509                 ASSERT(bfp->l_start >= 0);
5510                 if (bfp->l_len == 0) {
5511                         struct vattr va;
5512 
5513                         /*
5514                          * ftruncate should not change the ctime and
5515                          * mtime if we truncate the file to its
5516                          * previous size.
5517                          */
5518                         va.va_mask = AT_SIZE;
5519                         error = nfs3getattr(vp, &va, cr);
5520                         if (error || va.va_size == bfp->l_start)
5521                                 return (error);
5522                         va.va_mask = AT_SIZE;
5523                         va.va_size = bfp->l_start;
5524                         error = nfs3setattr(vp, &va, 0, cr);
5525 
5526                         if (error == 0 && bfp->l_start == 0)
5527                                 vnevent_truncate(vp, ct);
5528                 } else
5529                         error = EINVAL;
5530         }
5531 
5532         return (error);
5533 }
5534 
5535 /* ARGSUSED */
5536 static int
5537 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5538 {
5539 
5540         return (EINVAL);
5541 }
5542 
5543 /*
5544  * Setup and add an address space callback to do the work of the delmap call.
5545  * The callback will (and must be) deleted in the actual callback function.
5546  *
5547  * This is done in order to take care of the problem that we have with holding
5548  * the address space's a_lock for a long period of time (e.g. if the NFS server
5549  * is down).  Callbacks will be executed in the address space code while the
5550  * a_lock is not held.  Holding the address space's a_lock causes things such
5551  * as ps and fork to hang because they are trying to acquire this lock as well.
5552  */
5553 /* ARGSUSED */
5554 static int
5555 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5556         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5557         cred_t *cr, caller_context_t *ct)
5558 {
5559         int                     caller_found;
5560         int                     error;
5561         rnode_t                 *rp;
5562         nfs_delmap_args_t       *dmapp;
5563         nfs_delmapcall_t        *delmap_call;
5564 
5565         if (vp->v_flag & VNOMAP)
5566                 return (ENOSYS);
5567         /*
5568          * A process may not change zones if it has NFS pages mmap'ed
5569          * in, so we can't legitimately get here from the wrong zone.
5570          */
5571         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5572 
5573         rp = VTOR(vp);
5574 
5575         /*
5576          * The way that the address space of this process deletes its mapping
5577          * of this file is via the following call chains:
5578          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5579          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5580          *
5581          * With the use of address space callbacks we are allowed to drop the
5582          * address space lock, a_lock, while executing the NFS operations that
5583          * need to go over the wire.  Returning EAGAIN to the caller of this
5584          * function is what drives the execution of the callback that we add
5585          * below.  The callback will be executed by the address space code
5586          * after dropping the a_lock.  When the callback is finished, since
5587          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5588          * is called again on the same segment to finish the rest of the work
5589          * that needs to happen during unmapping.
5590          *
5591          * This action of calling back into the segment driver causes
5592          * nfs3_delmap() to get called again, but since the callback was
5593          * already executed at this point, it already did the work and there
5594          * is nothing left for us to do.
5595          *
5596          * To Summarize:
5597          * - The first time nfs3_delmap is called by the current thread is when
5598          * we add the caller associated with this delmap to the delmap caller
5599          * list, add the callback, and return EAGAIN.
5600          * - The second time in this call chain when nfs3_delmap is called we
5601          * will find this caller in the delmap caller list and realize there
5602          * is no more work to do thus removing this caller from the list and
5603          * returning the error that was set in the callback execution.
5604          */
5605         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5606         if (caller_found) {
5607                 /*
5608                  * 'error' is from the actual delmap operations.  To avoid
5609                  * hangs, we need to handle the return of EAGAIN differently
5610                  * since this is what drives the callback execution.
5611                  * In this case, we don't want to return EAGAIN and do the
5612                  * callback execution because there are none to execute.
5613                  */
5614                 if (error == EAGAIN)
5615                         return (0);
5616                 else
5617                         return (error);
5618         }
5619 
5620         /* current caller was not in the list */
5621         delmap_call = nfs_init_delmapcall();
5622 
5623         mutex_enter(&rp->r_statelock);
5624         list_insert_tail(&rp->r_indelmap, delmap_call);
5625         mutex_exit(&rp->r_statelock);
5626 
5627         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5628 
5629         dmapp->vp = vp;
5630         dmapp->off = off;
5631         dmapp->addr = addr;
5632         dmapp->len = len;
5633         dmapp->prot = prot;
5634         dmapp->maxprot = maxprot;
5635         dmapp->flags = flags;
5636         dmapp->cr = cr;
5637         dmapp->caller = delmap_call;
5638 
5639         error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5640             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5641 
5642         return (error ? error : EAGAIN);
5643 }
5644 
5645 /*
5646  * Remove some pages from an mmap'd vnode.  Just update the
5647  * count of pages.  If doing close-to-open, then flush and
5648  * commit all of the pages associated with this file.
5649  * Otherwise, start an asynchronous page flush to write out
5650  * any dirty pages.  This will also associate a credential
5651  * with the rnode which can be used to write the pages.
5652  */
5653 /* ARGSUSED */
5654 static void
5655 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5656 {
5657         int                     error;
5658         rnode_t                 *rp;
5659         mntinfo_t               *mi;
5660         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5661 
5662         rp = VTOR(dmapp->vp);
5663         mi = VTOMI(dmapp->vp);
5664 
5665         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5666         ASSERT(rp->r_mapcnt >= 0);
5667 
5668         /*
5669          * Initiate a page flush and potential commit if there are
5670          * pages, the file system was not mounted readonly, the segment
5671          * was mapped shared, and the pages themselves were writeable.
5672          */
5673         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5674             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5675                 mutex_enter(&rp->r_statelock);
5676                 rp->r_flags |= RDIRTY;
5677                 mutex_exit(&rp->r_statelock);
5678                 /*
5679                  * If this is a cross-zone access a sync putpage won't work, so
5680                  * the best we can do is try an async putpage.  That seems
5681                  * better than something more draconian such as discarding the
5682                  * dirty pages.
5683                  */
5684                 if ((mi->mi_flags & MI_NOCTO) ||
5685                     nfs_zone() != mi->mi_zone)
5686                         error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5687                             B_ASYNC, dmapp->cr, NULL);
5688                 else
5689                         error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5690                             dmapp->len, dmapp->cr);
5691                 if (!error) {
5692                         mutex_enter(&rp->r_statelock);
5693                         error = rp->r_error;
5694                         rp->r_error = 0;
5695                         mutex_exit(&rp->r_statelock);
5696                 }
5697         } else
5698                 error = 0;
5699 
5700         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5701                 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5702                     B_INVAL, dmapp->cr, NULL);
5703 
5704         dmapp->caller->error = error;
5705         (void) as_delete_callback(as, arg);
5706         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5707 }
5708 
5709 static int nfs3_pathconf_disable_cache = 0;
5710 
5711 #ifdef DEBUG
5712 static int nfs3_pathconf_cache_hits = 0;
5713 static int nfs3_pathconf_cache_misses = 0;
5714 #endif
5715 
5716 /* ARGSUSED */
5717 static int
5718 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5719         caller_context_t *ct)
5720 {
5721         int error;
5722         PATHCONF3args args;
5723         PATHCONF3res res;
5724         int douprintf;
5725         failinfo_t fi;
5726         rnode_t *rp;
5727         hrtime_t t;
5728 
5729         if (nfs_zone() != VTOMI(vp)->mi_zone)
5730                 return (EIO);
5731         /*
5732          * Large file spec - need to base answer on info stored
5733          * on original FSINFO response.
5734          */
5735         if (cmd == _PC_FILESIZEBITS) {
5736                 unsigned long long ll;
5737                 long l = 1;
5738 
5739                 ll = VTOMI(vp)->mi_maxfilesize;
5740 
5741                 if (ll == 0) {
5742                         *valp = 0;
5743                         return (0);
5744                 }
5745 
5746                 if (ll & 0xffffffff00000000) {
5747                         l += 32; ll >>= 32;
5748                 }
5749                 if (ll & 0xffff0000) {
5750                         l += 16; ll >>= 16;
5751                 }
5752                 if (ll & 0xff00) {
5753                         l += 8; ll >>= 8;
5754                 }
5755                 if (ll & 0xf0) {
5756                         l += 4; ll >>= 4;
5757                 }
5758                 if (ll & 0xc) {
5759                         l += 2; ll >>= 2;
5760                 }
5761                 if (ll & 0x2)
5762                         l += 2;
5763                 else if (ll & 0x1)
5764                         l += 1;
5765                 *valp = l;
5766                 return (0);
5767         }
5768 
5769         if (cmd == _PC_ACL_ENABLED) {
5770                 *valp = _ACL_ACLENT_ENABLED;
5771                 return (0);
5772         }
5773 
5774         if (cmd == _PC_XATTR_EXISTS) {
5775                 error = 0;
5776                 *valp = 0;
5777                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5778                         vnode_t *avp;
5779                         rnode_t *rp;
5780                         int error = 0;
5781                         mntinfo_t *mi = VTOMI(vp);
5782 
5783                         if (!(mi->mi_flags & MI_EXTATTR))
5784                                 return (0);
5785 
5786                         rp = VTOR(vp);
5787                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5788                             INTR(vp)))
5789                                 return (EINTR);
5790 
5791                         error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5792                         if (error || avp == NULL)
5793                                 error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5794 
5795                         nfs_rw_exit(&rp->r_rwlock);
5796 
5797                         if (error == 0 && avp != NULL) {
5798                                 error = do_xattr_exists_check(avp, valp, cr);
5799                                 VN_RELE(avp);
5800                         } else if (error == ENOENT) {
5801                                 error = 0;
5802                                 *valp = 0;
5803                         }
5804                 }
5805                 return (error);
5806         }
5807 
5808         rp = VTOR(vp);
5809         if (rp->r_pathconf != NULL) {
5810                 mutex_enter(&rp->r_statelock);
5811                 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5812                         kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5813                         rp->r_pathconf = NULL;
5814                 }
5815                 if (rp->r_pathconf != NULL) {
5816                         error = 0;
5817                         switch (cmd) {
5818                         case _PC_LINK_MAX:
5819                                 *valp = rp->r_pathconf->link_max;
5820                                 break;
5821                         case _PC_NAME_MAX:
5822                                 *valp = rp->r_pathconf->name_max;
5823                                 break;
5824                         case _PC_PATH_MAX:
5825                         case _PC_SYMLINK_MAX:
5826                                 *valp = MAXPATHLEN;
5827                                 break;
5828                         case _PC_CHOWN_RESTRICTED:
5829                                 *valp = rp->r_pathconf->chown_restricted;
5830                                 break;
5831                         case _PC_NO_TRUNC:
5832                                 *valp = rp->r_pathconf->no_trunc;
5833                                 break;
5834                         default:
5835                                 error = EINVAL;
5836                                 break;
5837                         }
5838                         mutex_exit(&rp->r_statelock);
5839 #ifdef DEBUG
5840                         nfs3_pathconf_cache_hits++;
5841 #endif
5842                         return (error);
5843                 }
5844                 mutex_exit(&rp->r_statelock);
5845         }
5846 #ifdef DEBUG
5847         nfs3_pathconf_cache_misses++;
5848 #endif
5849 
5850         args.object = *VTOFH3(vp);
5851         fi.vp = vp;
5852         fi.fhp = (caddr_t)&args.object;
5853         fi.copyproc = nfs3copyfh;
5854         fi.lookupproc = nfs3lookup;
5855         fi.xattrdirproc = acl_getxattrdir3;
5856 
5857         douprintf = 1;
5858 
5859         t = gethrtime();
5860 
5861         error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5862             xdr_nfs_fh3, (caddr_t)&args,
5863             xdr_PATHCONF3res, (caddr_t)&res, cr,
5864             &douprintf, &res.status, 0, &fi);
5865 
5866         if (error)
5867                 return (error);
5868 
5869         error = geterrno3(res.status);
5870 
5871         if (!error) {
5872                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5873                 if (!nfs3_pathconf_disable_cache) {
5874                         mutex_enter(&rp->r_statelock);
5875                         if (rp->r_pathconf == NULL) {
5876                                 rp->r_pathconf = kmem_alloc(
5877                                     sizeof (*rp->r_pathconf), KM_NOSLEEP);
5878                                 if (rp->r_pathconf != NULL)
5879                                         *rp->r_pathconf = res.resok.info;
5880                         }
5881                         mutex_exit(&rp->r_statelock);
5882                 }
5883                 switch (cmd) {
5884                 case _PC_LINK_MAX:
5885                         *valp = res.resok.info.link_max;
5886                         break;
5887                 case _PC_NAME_MAX:
5888                         *valp = res.resok.info.name_max;
5889                         break;
5890                 case _PC_PATH_MAX:
5891                 case _PC_SYMLINK_MAX:
5892                         *valp = MAXPATHLEN;
5893                         break;
5894                 case _PC_CHOWN_RESTRICTED:
5895                         *valp = res.resok.info.chown_restricted;
5896                         break;
5897                 case _PC_NO_TRUNC:
5898                         *valp = res.resok.info.no_trunc;
5899                         break;
5900                 default:
5901                         return (EINVAL);
5902                 }
5903         } else {
5904                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5905                 PURGE_STALE_FH(error, vp, cr);
5906         }
5907 
5908         return (error);
5909 }
5910 
5911 /*
5912  * Called by async thread to do synchronous pageio. Do the i/o, wait
5913  * for it to complete, and cleanup the page list when done.
5914  */
5915 static int
5916 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5917         int flags, cred_t *cr)
5918 {
5919         int error;
5920 
5921         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5922         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5923         if (flags & B_READ)
5924                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5925         else
5926                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5927         return (error);
5928 }
5929 
5930 /* ARGSUSED */
5931 static int
5932 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5933         int flags, cred_t *cr, caller_context_t *ct)
5934 {
5935         int error;
5936         rnode_t *rp;
5937 
5938         if (pp == NULL)
5939                 return (EINVAL);
5940         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5941                 return (EIO);
5942 
5943         rp = VTOR(vp);
5944         mutex_enter(&rp->r_statelock);
5945         rp->r_count++;
5946         mutex_exit(&rp->r_statelock);
5947 
5948         if (flags & B_ASYNC) {
5949                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5950                     nfs3_sync_pageio);
5951         } else
5952                 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5953         mutex_enter(&rp->r_statelock);
5954         rp->r_count--;
5955         cv_broadcast(&rp->r_cv);
5956         mutex_exit(&rp->r_statelock);
5957         return (error);
5958 }
5959 
5960 /* ARGSUSED */
5961 static void
5962 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5963         caller_context_t *ct)
5964 {
5965         int error;
5966         rnode_t *rp;
5967         page_t *plist;
5968         page_t *pptr;
5969         offset3 offset;
5970         count3 len;
5971         k_sigset_t smask;
5972 
5973         /*
5974          * We should get called with fl equal to either B_FREE or
5975          * B_INVAL.  Any other value is illegal.
5976          *
5977          * The page that we are either supposed to free or destroy
5978          * should be exclusive locked and its io lock should not
5979          * be held.
5980          */
5981         ASSERT(fl == B_FREE || fl == B_INVAL);
5982         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5983         rp = VTOR(vp);
5984 
5985         /*
5986          * If the page doesn't need to be committed or we shouldn't
5987          * even bother attempting to commit it, then just make sure
5988          * that the p_fsdata byte is clear and then either free or
5989          * destroy the page as appropriate.
5990          */
5991         if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5992                 pp->p_fsdata = C_NOCOMMIT;
5993                 if (fl == B_FREE)
5994                         page_free(pp, dn);
5995                 else
5996                         page_destroy(pp, dn);
5997                 return;
5998         }
5999 
6000         /*
6001          * If there is a page invalidation operation going on, then
6002          * if this is one of the pages being destroyed, then just
6003          * clear the p_fsdata byte and then either free or destroy
6004          * the page as appropriate.
6005          */
6006         mutex_enter(&rp->r_statelock);
6007         if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
6008                 mutex_exit(&rp->r_statelock);
6009                 pp->p_fsdata = C_NOCOMMIT;
6010                 if (fl == B_FREE)
6011                         page_free(pp, dn);
6012                 else
6013                         page_destroy(pp, dn);
6014                 return;
6015         }
6016 
6017         /*
6018          * If we are freeing this page and someone else is already
6019          * waiting to do a commit, then just unlock the page and
6020          * return.  That other thread will take care of commiting
6021          * this page.  The page can be freed sometime after the
6022          * commit has finished.  Otherwise, if the page is marked
6023          * as delay commit, then we may be getting called from
6024          * pvn_write_done, one page at a time.   This could result
6025          * in one commit per page, so we end up doing lots of small
6026          * commits instead of fewer larger commits.  This is bad,
6027          * we want do as few commits as possible.
6028          */
6029         if (fl == B_FREE) {
6030                 if (rp->r_flags & RCOMMITWAIT) {
6031                         page_unlock(pp);
6032                         mutex_exit(&rp->r_statelock);
6033                         return;
6034                 }
6035                 if (pp->p_fsdata == C_DELAYCOMMIT) {
6036                         pp->p_fsdata = C_COMMIT;
6037                         page_unlock(pp);
6038                         mutex_exit(&rp->r_statelock);
6039                         return;
6040                 }
6041         }
6042 
6043         /*
6044          * Check to see if there is a signal which would prevent an
6045          * attempt to commit the pages from being successful.  If so,
6046          * then don't bother with all of the work to gather pages and
6047          * generate the unsuccessful RPC.  Just return from here and
6048          * let the page be committed at some later time.
6049          */
6050         sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6051         if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6052                 sigunintr(&smask);
6053                 page_unlock(pp);
6054                 mutex_exit(&rp->r_statelock);
6055                 return;
6056         }
6057         sigunintr(&smask);
6058 
6059         /*
6060          * We are starting to need to commit pages, so let's try
6061          * to commit as many as possible at once to reduce the
6062          * overhead.
6063          *
6064          * Set the `commit inprogress' state bit.  We must
6065          * first wait until any current one finishes.  Then
6066          * we initialize the c_pages list with this page.
6067          */
6068         while (rp->r_flags & RCOMMIT) {
6069                 rp->r_flags |= RCOMMITWAIT;
6070                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6071                 rp->r_flags &= ~RCOMMITWAIT;
6072         }
6073         rp->r_flags |= RCOMMIT;
6074         mutex_exit(&rp->r_statelock);
6075         ASSERT(rp->r_commit.c_pages == NULL);
6076         rp->r_commit.c_pages = pp;
6077         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6078         rp->r_commit.c_commlen = PAGESIZE;
6079 
6080         /*
6081          * Gather together all other pages which can be committed.
6082          * They will all be chained off r_commit.c_pages.
6083          */
6084         nfs3_get_commit(vp);
6085 
6086         /*
6087          * Clear the `commit inprogress' status and disconnect
6088          * the list of pages to be committed from the rnode.
6089          * At this same time, we also save the starting offset
6090          * and length of data to be committed on the server.
6091          */
6092         plist = rp->r_commit.c_pages;
6093         rp->r_commit.c_pages = NULL;
6094         offset = rp->r_commit.c_commbase;
6095         len = rp->r_commit.c_commlen;
6096         mutex_enter(&rp->r_statelock);
6097         rp->r_flags &= ~RCOMMIT;
6098         cv_broadcast(&rp->r_commit.c_cv);
6099         mutex_exit(&rp->r_statelock);
6100 
6101         if (curproc == proc_pageout || curproc == proc_fsflush ||
6102             nfs_zone() != VTOMI(vp)->mi_zone) {
6103                 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6104                 return;
6105         }
6106 
6107         /*
6108          * Actually generate the COMMIT3 over the wire operation.
6109          */
6110         error = nfs3_commit(vp, offset, len, cr);
6111 
6112         /*
6113          * If we got an error during the commit, just unlock all
6114          * of the pages.  The pages will get retransmitted to the
6115          * server during a putpage operation.
6116          */
6117         if (error) {
6118                 while (plist != NULL) {
6119                         pptr = plist;
6120                         page_sub(&plist, pptr);
6121                         page_unlock(pptr);
6122                 }
6123                 return;
6124         }
6125 
6126         /*
6127          * We've tried as hard as we can to commit the data to stable
6128          * storage on the server.  We release the rest of the pages
6129          * and clear the commit required state.  They will be put
6130          * onto the tail of the cachelist if they are nolonger
6131          * mapped.
6132          */
6133         while (plist != pp) {
6134                 pptr = plist;
6135                 page_sub(&plist, pptr);
6136                 pptr->p_fsdata = C_NOCOMMIT;
6137                 (void) page_release(pptr, 1);
6138         }
6139 
6140         /*
6141          * It is possible that nfs3_commit didn't return error but
6142          * some other thread has modified the page we are going
6143          * to free/destroy.
6144          *    In this case we need to rewrite the page. Do an explicit check
6145          * before attempting to free/destroy the page. If modified, needs to
6146          * be rewritten so unlock the page and return.
6147          */
6148         if (hat_ismod(pp)) {
6149                 pp->p_fsdata = C_NOCOMMIT;
6150                 page_unlock(pp);
6151                 return;
6152         }
6153 
6154         /*
6155          * Now, as appropriate, either free or destroy the page
6156          * that we were called with.
6157          */
6158         pp->p_fsdata = C_NOCOMMIT;
6159         if (fl == B_FREE)
6160                 page_free(pp, dn);
6161         else
6162                 page_destroy(pp, dn);
6163 }
6164 
6165 static int
6166 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6167 {
6168         int error;
6169         rnode_t *rp;
6170         COMMIT3args args;
6171         COMMIT3res res;
6172         int douprintf;
6173         cred_t *cred;
6174 
6175         rp = VTOR(vp);
6176         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6177 
6178         mutex_enter(&rp->r_statelock);
6179         if (rp->r_cred != NULL) {
6180                 cred = rp->r_cred;
6181                 crhold(cred);
6182         } else {
6183                 rp->r_cred = cr;
6184                 crhold(cr);
6185                 cred = cr;
6186                 crhold(cred);
6187         }
6188         mutex_exit(&rp->r_statelock);
6189 
6190         args.file = *VTOFH3(vp);
6191         args.offset = offset;
6192         args.count = count;
6193 
6194 doitagain:
6195         douprintf = 1;
6196         error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6197             xdr_COMMIT3args, (caddr_t)&args,
6198             xdr_COMMIT3res, (caddr_t)&res, cred,
6199             &douprintf, &res.status, 0, NULL);
6200 
6201         crfree(cred);
6202 
6203         if (error)
6204                 return (error);
6205 
6206         error = geterrno3(res.status);
6207         if (!error) {
6208                 ASSERT(rp->r_flags & RHAVEVERF);
6209                 mutex_enter(&rp->r_statelock);
6210                 if (rp->r_verf == res.resok.verf) {
6211                         mutex_exit(&rp->r_statelock);
6212                         return (0);
6213                 }
6214                 nfs3_set_mod(vp);
6215                 rp->r_verf = res.resok.verf;
6216                 mutex_exit(&rp->r_statelock);
6217                 error = NFS_VERF_MISMATCH;
6218         } else {
6219                 if (error == EACCES) {
6220                         mutex_enter(&rp->r_statelock);
6221                         if (cred != cr) {
6222                                 if (rp->r_cred != NULL)
6223                                         crfree(rp->r_cred);
6224                                 rp->r_cred = cr;
6225                                 crhold(cr);
6226                                 cred = cr;
6227                                 crhold(cred);
6228                                 mutex_exit(&rp->r_statelock);
6229                                 goto doitagain;
6230                         }
6231                         mutex_exit(&rp->r_statelock);
6232                 }
6233                 /*
6234                  * Can't do a PURGE_STALE_FH here because this
6235                  * can cause a deadlock.  nfs3_commit can
6236                  * be called from nfs3_dispose which can be called
6237                  * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6238                  * can call back to pvn_vplist_dirty.
6239                  */
6240                 if (error == ESTALE) {
6241                         mutex_enter(&rp->r_statelock);
6242                         rp->r_flags |= RSTALE;
6243                         if (!rp->r_error)
6244                                 rp->r_error = error;
6245                         mutex_exit(&rp->r_statelock);
6246                         PURGE_ATTRCACHE(vp);
6247                 } else {
6248                         mutex_enter(&rp->r_statelock);
6249                         if (!rp->r_error)
6250                                 rp->r_error = error;
6251                         mutex_exit(&rp->r_statelock);
6252                 }
6253         }
6254 
6255         return (error);
6256 }
6257 
6258 static void
6259 nfs3_set_mod(vnode_t *vp)
6260 {
6261         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6262 
6263         pvn_vplist_setdirty(vp, nfs_setmod_check);
6264 }
6265 
6266 /*
6267  * This routine is used to gather together a page list of the pages
6268  * which are to be committed on the server.  This routine must not
6269  * be called if the calling thread holds any locked pages.
6270  *
6271  * The calling thread must have set RCOMMIT.  This bit is used to
6272  * serialize access to the commit structure in the rnode.  As long
6273  * as the thread has set RCOMMIT, then it can manipulate the commit
6274  * structure without requiring any other locks.
6275  */
6276 static void
6277 nfs3_get_commit(vnode_t *vp)
6278 {
6279         rnode_t *rp;
6280         page_t *pp;
6281         kmutex_t *vphm;
6282 
6283         rp = VTOR(vp);
6284 
6285         ASSERT(rp->r_flags & RCOMMIT);
6286 
6287         vphm = page_vnode_mutex(vp);
6288         mutex_enter(vphm);
6289 
6290         /*
6291          * If there are no pages associated with this vnode, then
6292          * just return.
6293          */
6294         if ((pp = vp->v_pages) == NULL) {
6295                 mutex_exit(vphm);
6296                 return;
6297         }
6298 
6299         /*
6300          * Step through all of the pages associated with this vnode
6301          * looking for pages which need to be committed.
6302          */
6303         do {
6304                 /* Skip marker pages. */
6305                 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
6306                         continue;
6307 
6308                 /*
6309                  * If this page does not need to be committed or is
6310                  * modified, then just skip it.
6311                  */
6312                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6313                         continue;
6314 
6315                 /*
6316                  * Attempt to lock the page.  If we can't, then
6317                  * someone else is messing with it and we will
6318                  * just skip it.
6319                  */
6320                 if (!page_trylock(pp, SE_EXCL))
6321                         continue;
6322 
6323                 /*
6324                  * If this page does not need to be committed or is
6325                  * modified, then just skip it.  Recheck now that
6326                  * the page is locked.
6327                  */
6328                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6329                         page_unlock(pp);
6330                         continue;
6331                 }
6332 
6333                 if (PP_ISFREE(pp)) {
6334                         cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6335                             (void *)pp);
6336                 }
6337 
6338                 /*
6339                  * The page needs to be committed and we locked it.
6340                  * Update the base and length parameters and add it
6341                  * to r_pages.
6342                  */
6343                 if (rp->r_commit.c_pages == NULL) {
6344                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6345                         rp->r_commit.c_commlen = PAGESIZE;
6346                 } else if (pp->p_offset < rp->r_commit.c_commbase) {
6347                         rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6348                             (offset3)pp->p_offset + rp->r_commit.c_commlen;
6349                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6350                 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6351                     <= pp->p_offset) {
6352                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6353                             rp->r_commit.c_commbase + PAGESIZE;
6354                 }
6355                 page_add(&rp->r_commit.c_pages, pp);
6356         } while ((pp = pp->p_vpnext) != vp->v_pages);
6357 
6358         mutex_exit(vphm);
6359 }
6360 
6361 /*
6362  * This routine is used to gather together a page list of the pages
6363  * which are to be committed on the server.  This routine must not
6364  * be called if the calling thread holds any locked pages.
6365  *
6366  * The calling thread must have set RCOMMIT.  This bit is used to
6367  * serialize access to the commit structure in the rnode.  As long
6368  * as the thread has set RCOMMIT, then it can manipulate the commit
6369  * structure without requiring any other locks.
6370  */
6371 static void
6372 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6373 {
6374 
6375         rnode_t *rp;
6376         page_t *pp;
6377         u_offset_t end;
6378         u_offset_t off;
6379 
6380         ASSERT(len != 0);
6381 
6382         rp = VTOR(vp);
6383 
6384         ASSERT(rp->r_flags & RCOMMIT);
6385         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6386 
6387         /*
6388          * If there are no pages associated with this vnode, then
6389          * just return.
6390          */
6391         if ((pp = vp->v_pages) == NULL)
6392                 return;
6393 
6394         /*
6395          * Calculate the ending offset.
6396          */
6397         end = soff + len;
6398 
6399         for (off = soff; off < end; off += PAGESIZE) {
6400                 /*
6401                  * Lookup each page by vp, offset.
6402                  */
6403                 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6404                         continue;
6405 
6406                 /*
6407                  * If this page does not need to be committed or is
6408                  * modified, then just skip it.
6409                  */
6410                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6411                         page_unlock(pp);
6412                         continue;
6413                 }
6414 
6415                 ASSERT(PP_ISFREE(pp) == 0);
6416 
6417                 /*
6418                  * The page needs to be committed and we locked it.
6419                  * Update the base and length parameters and add it
6420                  * to r_pages.
6421                  */
6422                 if (rp->r_commit.c_pages == NULL) {
6423                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6424                         rp->r_commit.c_commlen = PAGESIZE;
6425                 } else {
6426                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6427                             rp->r_commit.c_commbase + PAGESIZE;
6428                 }
6429                 page_add(&rp->r_commit.c_pages, pp);
6430         }
6431 }
6432 
6433 static int
6434 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6435 {
6436         int error;
6437         writeverf3 write_verf;
6438         rnode_t *rp = VTOR(vp);
6439 
6440         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6441         /*
6442          * Flush the data portion of the file and then commit any
6443          * portions which need to be committed.  This may need to
6444          * be done twice if the server has changed state since
6445          * data was last written.  The data will need to be
6446          * rewritten to the server and then a new commit done.
6447          *
6448          * In fact, this may need to be done several times if the
6449          * server is having problems and crashing while we are
6450          * attempting to do this.
6451          */
6452 
6453 top:
6454         /*
6455          * Do a flush based on the poff and plen arguments.  This
6456          * will asynchronously write out any modified pages in the
6457          * range specified by (poff, plen).  This starts all of the
6458          * i/o operations which will be waited for in the next
6459          * call to nfs3_putpage
6460          */
6461 
6462         mutex_enter(&rp->r_statelock);
6463         write_verf = rp->r_verf;
6464         mutex_exit(&rp->r_statelock);
6465 
6466         error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6467         if (error == EAGAIN)
6468                 error = 0;
6469 
6470         /*
6471          * Do a flush based on the poff and plen arguments.  This
6472          * will synchronously write out any modified pages in the
6473          * range specified by (poff, plen) and wait until all of
6474          * the asynchronous i/o's in that range are done as well.
6475          */
6476         if (!error)
6477                 error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6478 
6479         if (error)
6480                 return (error);
6481 
6482         mutex_enter(&rp->r_statelock);
6483         if (rp->r_verf != write_verf) {
6484                 mutex_exit(&rp->r_statelock);
6485                 goto top;
6486         }
6487         mutex_exit(&rp->r_statelock);
6488 
6489         /*
6490          * Now commit any pages which might need to be committed.
6491          * If the error, NFS_VERF_MISMATCH, is returned, then
6492          * start over with the flush operation.
6493          */
6494 
6495         error = nfs3_commit_vp(vp, poff, plen, cr);
6496 
6497         if (error == NFS_VERF_MISMATCH)
6498                 goto top;
6499 
6500         return (error);
6501 }
6502 
6503 static int
6504 nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6505 {
6506         rnode_t *rp;
6507         page_t *plist;
6508         offset3 offset;
6509         count3 len;
6510 
6511 
6512         rp = VTOR(vp);
6513 
6514         if (nfs_zone() != VTOMI(vp)->mi_zone)
6515                 return (EIO);
6516         /*
6517          * Set the `commit inprogress' state bit.  We must
6518          * first wait until any current one finishes.
6519          */
6520         mutex_enter(&rp->r_statelock);
6521         while (rp->r_flags & RCOMMIT) {
6522                 rp->r_flags |= RCOMMITWAIT;
6523                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6524                 rp->r_flags &= ~RCOMMITWAIT;
6525         }
6526         rp->r_flags |= RCOMMIT;
6527         mutex_exit(&rp->r_statelock);
6528 
6529         /*
6530          * Gather together all of the pages which need to be
6531          * committed.
6532          */
6533         if (plen == 0)
6534                 nfs3_get_commit(vp);
6535         else
6536                 nfs3_get_commit_range(vp, poff, plen);
6537 
6538         /*
6539          * Clear the `commit inprogress' bit and disconnect the
6540          * page list which was gathered together in nfs3_get_commit.
6541          */
6542         plist = rp->r_commit.c_pages;
6543         rp->r_commit.c_pages = NULL;
6544         offset = rp->r_commit.c_commbase;
6545         len = rp->r_commit.c_commlen;
6546         mutex_enter(&rp->r_statelock);
6547         rp->r_flags &= ~RCOMMIT;
6548         cv_broadcast(&rp->r_commit.c_cv);
6549         mutex_exit(&rp->r_statelock);
6550 
6551         /*
6552          * If any pages need to be committed, commit them and
6553          * then unlock them so that they can be freed some
6554          * time later.
6555          */
6556         if (plist != NULL) {
6557                 /*
6558                  * No error occurred during the flush portion
6559                  * of this operation, so now attempt to commit
6560                  * the data to stable storage on the server.
6561                  *
6562                  * This will unlock all of the pages on the list.
6563                  */
6564                 return (nfs3_sync_commit(vp, plist, offset, len, cr));
6565         }
6566         return (0);
6567 }
6568 
6569 static int
6570 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6571         cred_t *cr)
6572 {
6573         int error;
6574         page_t *pp;
6575 
6576         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6577         error = nfs3_commit(vp, offset, count, cr);
6578 
6579         /*
6580          * If we got an error, then just unlock all of the pages
6581          * on the list.
6582          */
6583         if (error) {
6584                 while (plist != NULL) {
6585                         pp = plist;
6586                         page_sub(&plist, pp);
6587                         page_unlock(pp);
6588                 }
6589                 return (error);
6590         }
6591         /*
6592          * We've tried as hard as we can to commit the data to stable
6593          * storage on the server.  We just unlock the pages and clear
6594          * the commit required state.  They will get freed later.
6595          */
6596         while (plist != NULL) {
6597                 pp = plist;
6598                 page_sub(&plist, pp);
6599                 pp->p_fsdata = C_NOCOMMIT;
6600                 page_unlock(pp);
6601         }
6602 
6603         return (error);
6604 }
6605 
6606 static void
6607 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6608         cred_t *cr)
6609 {
6610         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6611         (void) nfs3_sync_commit(vp, plist, offset, count, cr);
6612 }
6613 
6614 /* ARGSUSED */
6615 static int
6616 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6617         caller_context_t *ct)
6618 {
6619         int error;
6620         mntinfo_t *mi;
6621 
6622         mi = VTOMI(vp);
6623 
6624         if (nfs_zone() != mi->mi_zone)
6625                 return (EIO);
6626 
6627         if (mi->mi_flags & MI_ACL) {
6628                 error = acl_setacl3(vp, vsecattr, flag, cr);
6629                 if (mi->mi_flags & MI_ACL)
6630                         return (error);
6631         }
6632 
6633         return (ENOSYS);
6634 }
6635 
6636 /* ARGSUSED */
6637 static int
6638 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6639         caller_context_t *ct)
6640 {
6641         int error;
6642         mntinfo_t *mi;
6643 
6644         mi = VTOMI(vp);
6645 
6646         if (nfs_zone() != mi->mi_zone)
6647                 return (EIO);
6648 
6649         if (mi->mi_flags & MI_ACL) {
6650                 error = acl_getacl3(vp, vsecattr, flag, cr);
6651                 if (mi->mi_flags & MI_ACL)
6652                         return (error);
6653         }
6654 
6655         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6656 }
6657 
6658 /* ARGSUSED */
6659 static int
6660 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6661         caller_context_t *ct)
6662 {
6663         int error;
6664         struct shrlock nshr;
6665         struct nfs_owner nfs_owner;
6666         netobj lm_fh3;
6667 
6668         if (nfs_zone() != VTOMI(vp)->mi_zone)
6669                 return (EIO);
6670 
6671         /*
6672          * check for valid cmd parameter
6673          */
6674         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6675                 return (EINVAL);
6676 
6677         /*
6678          * Check access permissions
6679          */
6680         if (cmd == F_SHARE &&
6681             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6682             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6683                 return (EBADF);
6684 
6685         /*
6686          * If the filesystem is mounted using local locking, pass the
6687          * request off to the local share code.
6688          */
6689         if (VTOMI(vp)->mi_flags & MI_LLOCK)
6690                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6691 
6692         switch (cmd) {
6693         case F_SHARE:
6694         case F_UNSHARE:
6695                 lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6696                 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6697 
6698                 /*
6699                  * If passed an owner that is too large to fit in an
6700                  * nfs_owner it is likely a recursive call from the
6701                  * lock manager client and pass it straight through.  If
6702                  * it is not a nfs_owner then simply return an error.
6703                  */
6704                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6705                         if (((struct nfs_owner *)shr->s_owner)->magic !=
6706                             NFS_OWNER_MAGIC)
6707                                 return (EINVAL);
6708 
6709                         if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6710                                 error = set_errno(error);
6711                         }
6712                         return (error);
6713                 }
6714                 /*
6715                  * Remote share reservations owner is a combination of
6716                  * a magic number, hostname, and the local owner
6717                  */
6718                 bzero(&nfs_owner, sizeof (nfs_owner));
6719                 nfs_owner.magic = NFS_OWNER_MAGIC;
6720                 (void) strncpy(nfs_owner.hname, uts_nodename(),
6721                     sizeof (nfs_owner.hname));
6722                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6723                 nshr.s_access = shr->s_access;
6724                 nshr.s_deny = shr->s_deny;
6725                 nshr.s_sysid = 0;
6726                 nshr.s_pid = ttoproc(curthread)->p_pid;
6727                 nshr.s_own_len = sizeof (nfs_owner);
6728                 nshr.s_owner = (caddr_t)&nfs_owner;
6729 
6730                 if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6731                         error = set_errno(error);
6732                 }
6733 
6734                 break;
6735 
6736         case F_HASREMOTELOCKS:
6737                 /*
6738                  * NFS client can't store remote locks itself
6739                  */
6740                 shr->s_access = 0;
6741                 error = 0;
6742                 break;
6743 
6744         default:
6745                 error = EINVAL;
6746                 break;
6747         }
6748 
6749         return (error);
6750 }