1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/buf.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/uio.h>
  39 #include <sys/stat.h>
  40 #include <sys/errno.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/statvfs.h>
  43 #include <sys/kmem.h>
  44 #include <sys/kstat.h>
  45 #include <sys/dirent.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/debug.h>
  48 #include <sys/vtrace.h>
  49 #include <sys/mode.h>
  50 #include <sys/acl.h>
  51 #include <sys/nbmlock.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 
  55 #include <rpc/types.h>
  56 #include <rpc/auth.h>
  57 #include <rpc/svc.h>
  58 
  59 #include <nfs/nfs.h>
  60 #include <nfs/export.h>
  61 #include <nfs/nfs_cmd.h>
  62 
  63 #include <vm/hat.h>
  64 #include <vm/as.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/seg_kmem.h>
  68 
  69 #include <sys/strsubr.h>
  70 
  71 /*
  72  * These are the interface routines for the server side of the
  73  * Network File System.  See the NFS version 2 protocol specification
  74  * for a description of this interface.
  75  */
  76 
  77 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  78 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  79                         cred_t *);
  80 
  81 /*
  82  * Some "over the wire" UNIX file types.  These are encoded
  83  * into the mode.  This needs to be fixed in the next rev.
  84  */
  85 #define IFMT            0170000         /* type of file */
  86 #define IFCHR           0020000         /* character special */
  87 #define IFBLK           0060000         /* block special */
  88 #define IFSOCK          0140000         /* socket */
  89 
  90 u_longlong_t nfs2_srv_caller_id;
  91 
  92 /*
  93  * Get file attributes.
  94  * Returns the current attributes of the file with the given fhandle.
  95  */
  96 /* ARGSUSED */
  97 void
  98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
  99         struct svc_req *req, cred_t *cr)
 100 {
 101         int error;
 102         vnode_t *vp;
 103         struct vattr va;
 104 
 105         vp = nfs_fhtovp(fhp, exi);
 106         if (vp == NULL) {
 107                 ns->ns_status = NFSERR_STALE;
 108                 return;
 109         }
 110 
 111         /*
 112          * Do the getattr.
 113          */
 114         va.va_mask = AT_ALL;    /* we want all the attributes */
 115 
 116         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 117 
 118         /* check for overflows */
 119         if (!error) {
 120                 /* Lie about the object type for a referral */
 121                 if (vn_is_nfs_reparse(vp, cr))
 122                         va.va_type = VLNK;
 123 
 124                 acl_perm(vp, exi, &va, cr);
 125                 error = vattr_to_nattr(&va, &ns->ns_attr);
 126         }
 127 
 128         VN_RELE(vp);
 129 
 130         ns->ns_status = puterrno(error);
 131 }
 132 void *
 133 rfs_getattr_getfh(fhandle_t *fhp)
 134 {
 135         return (fhp);
 136 }
 137 
 138 /*
 139  * Set file attributes.
 140  * Sets the attributes of the file with the given fhandle.  Returns
 141  * the new attributes.
 142  */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;		/* ATTR_UTIME when client-supplied times are used */
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes before the change (size handling) */
	struct flock64 bf;	/* region argument handed to VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes are allowed on a read-only export. */
	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * CC_DONTBLOCK asks delegation monitors to fail with EAGAIN and
	 * set CC_WOULDBLOCK instead of blocking; that case is detected
	 * below and the reply is dropped so the client retries.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Fetch the current owner and size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The affected region is the span between the old
			 * size and the new one, whichever way it moves.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * The owner is always allowed to change the size.  Use
		 * VOP_SPACE(F_FREESP) to bypass VOP_SETATTR's access
		 * check, and clear AT_SIZE so the VOP_SETATTR below
		 * does not repeat the size change.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 324 void *
 325 rfs_setattr_getfh(struct nfssaargs *args)
 326 {
 327         return (&args->saa_fh);
 328 }
 329 
 330 /*
 331  * Directory lookup.
 332  * Returns an fhandle and file attributes for file name in a directory.
 333  */
 334 /* ARGSUSED */
 335 void
 336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 337         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 338 {
 339         int error;
 340         vnode_t *dvp;
 341         vnode_t *vp;
 342         struct vattr va;
 343         fhandle_t *fhp = da->da_fhandle;
 344         struct sec_ol sec = {0, 0};
 345         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 346         char *name;
 347         struct sockaddr *ca;
 348 
 349         /*
 350          * Trusted Extension doesn't support NFSv2. MOUNT
 351          * will reject v2 clients. Need to prevent v2 client
 352          * access via WebNFS here.
 353          */
 354         if (is_system_labeled() && req->rq_vers == 2) {
 355                 dr->dr_status = NFSERR_ACCES;
 356                 return;
 357         }
 358 
 359         /*
 360          * Disallow NULL paths
 361          */
 362         if (da->da_name == NULL || *da->da_name == '\0') {
 363                 dr->dr_status = NFSERR_ACCES;
 364                 return;
 365         }
 366 
 367         /*
 368          * Allow lookups from the root - the default
 369          * location of the public filehandle.
 370          */
 371         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 372                 dvp = rootdir;
 373                 VN_HOLD(dvp);
 374         } else {
 375                 dvp = nfs_fhtovp(fhp, exi);
 376                 if (dvp == NULL) {
 377                         dr->dr_status = NFSERR_STALE;
 378                         return;
 379                 }
 380         }
 381 
 382         /*
 383          * Not allow lookup beyond root.
 384          * If the filehandle matches a filehandle of the exi,
 385          * then the ".." refers beyond the root of an exported filesystem.
 386          */
 387         if (strcmp(da->da_name, "..") == 0 &&
 388             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 389                 VN_RELE(dvp);
 390                 dr->dr_status = NFSERR_NOENT;
 391                 return;
 392         }
 393 
 394         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 395         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 396             MAXPATHLEN);
 397 
 398         if (name == NULL) {
 399                 dr->dr_status = NFSERR_ACCES;
 400                 return;
 401         }
 402 
 403         /*
 404          * If the public filehandle is used then allow
 405          * a multi-component lookup, i.e. evaluate
 406          * a pathname and follow symbolic links if
 407          * necessary.
 408          *
 409          * This may result in a vnode in another filesystem
 410          * which is OK as long as the filesystem is exported.
 411          */
 412         if (PUBLIC_FH2(fhp)) {
 413                 publicfh_flag = TRUE;
 414                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 415                     &sec);
 416         } else {
 417                 /*
 418                  * Do a normal single component lookup.
 419                  */
 420                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 421                     NULL, NULL, NULL);
 422         }
 423 
 424         if (name != da->da_name)
 425                 kmem_free(name, MAXPATHLEN);
 426 
 427 
 428         if (!error) {
 429                 va.va_mask = AT_ALL;    /* we want everything */
 430 
 431                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 432 
 433                 /* check for overflows */
 434                 if (!error) {
 435                         acl_perm(vp, exi, &va, cr);
 436                         error = vattr_to_nattr(&va, &dr->dr_attr);
 437                         if (!error) {
 438                                 if (sec.sec_flags & SEC_QUERY)
 439                                         error = makefh_ol(&dr->dr_fhandle, exi,
 440                                             sec.sec_index);
 441                                 else {
 442                                         error = makefh(&dr->dr_fhandle, vp,
 443                                             exi);
 444                                         if (!error && publicfh_flag &&
 445                                             !chk_clnt_sec(exi, req))
 446                                                 auth_weak = TRUE;
 447                                 }
 448                         }
 449                 }
 450                 VN_RELE(vp);
 451         }
 452 
 453         VN_RELE(dvp);
 454 
 455         /*
 456          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 457          * and have obtained a new exportinfo in exi which needs to be
 458          * released. Note the the original exportinfo pointed to by exi
 459          * will be released by the caller, comon_dispatch.
 460          */
 461         if (publicfh_flag && exi != NULL)
 462                 exi_rele(exi);
 463 
 464         /*
 465          * If it's public fh, no 0x81, and client's flavor is
 466          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 467          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 468          */
 469         if (auth_weak)
 470                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 471         else
 472                 dr->dr_status = puterrno(error);
 473 }
 474 void *
 475 rfs_lookup_getfh(struct nfsdiropargs *da)
 476 {
 477         return (da->da_fhandle);
 478 }
 479 
 480 /*
 481  * Read symbolic link.
 482  * Returns the string in the symbolic link at the given fhandle.
 483  */
 484 /* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* object is a reparse point posing as a link */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Only the mode is needed, for the MANDLOCK test below. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Deny access to objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text exactly fills the buffer
		 * (uio_resid == 0), rl_count == NFS_MAXPATHLEN and the NUL
		 * below is written one byte past the NFS_MAXPATHLEN
		 * allocation — confirm symlink targets are bounded below
		 * NFS_MAXPATHLEN.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Translate the link text to the client's character set if needed;
	 * on translation, swap in the new buffer and free the original.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
 610 void *
 611 rfs_readlink_getfh(fhandle_t *fhp)
 612 {
 613         return (fhp);
 614 }
 615 /*
 616  * Free data allocated by rfs_readlink
 617  */
 618 void
 619 rfs_rlfree(struct nfsrdlnres *rl)
 620 {
 621         if (rl->rl_data != NULL)
 622                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 623 }
 624 
 625 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 626 
 627 /*
 628  * Read data.
 629  * Returns some data read from the file at the given fhandle.
 630  */
 631 /* ARGSUSED */
 632 void
 633 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 634         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 635 {
 636         vnode_t *vp;
 637         int error;
 638         struct vattr va;
 639         struct iovec iov;
 640         struct uio uio;
 641         mblk_t *mp;
 642         int alloc_err = 0;
 643         int in_crit = 0;
 644         caller_context_t ct;
 645 
 646         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 647         if (vp == NULL) {
 648                 rr->rr_data = NULL;
 649                 rr->rr_status = NFSERR_STALE;
 650                 return;
 651         }
 652 
 653         if (vp->v_type != VREG) {
 654                 VN_RELE(vp);
 655                 rr->rr_data = NULL;
 656                 rr->rr_status = NFSERR_ISDIR;
 657                 return;
 658         }
 659 
 660         ct.cc_sysid = 0;
 661         ct.cc_pid = 0;
 662         ct.cc_caller_id = nfs2_srv_caller_id;
 663         ct.cc_flags = CC_DONTBLOCK;
 664 
 665         /*
 666          * Enter the critical region before calling VOP_RWLOCK
 667          * to avoid a deadlock with write requests.
 668          */
 669         if (nbl_need_check(vp)) {
 670                 nbl_start_crit(vp, RW_READER);
 671                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 672                     0, NULL)) {
 673                         nbl_end_crit(vp);
 674                         VN_RELE(vp);
 675                         rr->rr_data = NULL;
 676                         rr->rr_status = NFSERR_ACCES;
 677                         return;
 678                 }
 679                 in_crit = 1;
 680         }
 681 
 682         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 683 
 684         /* check if a monitor detected a delegation conflict */
 685         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 686                 VN_RELE(vp);
 687                 /* mark as wouldblock so response is dropped */
 688                 curthread->t_flag |= T_WOULDBLOCK;
 689 
 690                 rr->rr_data = NULL;
 691                 return;
 692         }
 693 
 694         va.va_mask = AT_ALL;
 695 
 696         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 697 
 698         if (error) {
 699                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 700                 if (in_crit)
 701                         nbl_end_crit(vp);
 702 
 703                 VN_RELE(vp);
 704                 rr->rr_data = NULL;
 705                 rr->rr_status = puterrno(error);
 706 
 707                 return;
 708         }
 709 
 710         /*
 711          * This is a kludge to allow reading of files created
 712          * with no read permission.  The owner of the file
 713          * is always allowed to read it.
 714          */
 715         if (crgetuid(cr) != va.va_uid) {
 716                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 717 
 718                 if (error) {
 719                         /*
 720                          * Exec is the same as read over the net because
 721                          * of demand loading.
 722                          */
 723                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 724                 }
 725                 if (error) {
 726                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 727                         if (in_crit)
 728                                 nbl_end_crit(vp);
 729                         VN_RELE(vp);
 730                         rr->rr_data = NULL;
 731                         rr->rr_status = puterrno(error);
 732 
 733                         return;
 734                 }
 735         }
 736 
 737         if (MANDLOCK(vp, va.va_mode)) {
 738                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 739                 if (in_crit)
 740                         nbl_end_crit(vp);
 741 
 742                 VN_RELE(vp);
 743                 rr->rr_data = NULL;
 744                 rr->rr_status = NFSERR_ACCES;
 745 
 746                 return;
 747         }
 748 
 749         rr->rr_ok.rrok_wlist_len = 0;
 750         rr->rr_ok.rrok_wlist = NULL;
 751 
 752         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 753                 rr->rr_count = 0;
 754                 rr->rr_data = NULL;
 755                 /*
 756                  * In this case, status is NFS_OK, but there is no data
 757                  * to encode. So set rr_mp to NULL.
 758                  */
 759                 rr->rr_mp = NULL;
 760                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 761                 if (rr->rr_ok.rrok_wlist)
 762                         clist_zero_len(rr->rr_ok.rrok_wlist);
 763                 goto done;
 764         }
 765 
 766         if (ra->ra_wlist) {
 767                 mp = NULL;
 768                 rr->rr_mp = NULL;
 769                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 770                 if (ra->ra_count > iov.iov_len) {
 771                         rr->rr_data = NULL;
 772                         rr->rr_status = NFSERR_INVAL;
 773                         goto done;
 774                 }
 775         } else {
 776                 /*
 777                  * mp will contain the data to be sent out in the read reply.
 778                  * This will be freed after the reply has been sent out (by the
 779                  * driver).
 780                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 781                  * that the call to xdrmblk_putmblk() never fails.
 782                  */
 783                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 784                     &alloc_err);
 785                 ASSERT(mp != NULL);
 786                 ASSERT(alloc_err == 0);
 787 
 788                 rr->rr_mp = mp;
 789 
 790                 /*
 791                  * Set up io vector
 792                  */
 793                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 794                 iov.iov_len = ra->ra_count;
 795         }
 796 
 797         uio.uio_iov = &iov;
 798         uio.uio_iovcnt = 1;
 799         uio.uio_segflg = UIO_SYSSPACE;
 800         uio.uio_extflg = UIO_COPY_CACHED;
 801         uio.uio_loffset = (offset_t)ra->ra_offset;
 802         uio.uio_resid = ra->ra_count;
 803 
 804         error = VOP_READ(vp, &uio, 0, cr, &ct);
 805 
 806         if (error) {
 807                 if (mp)
 808                         freeb(mp);
 809 
 810                 /*
 811                  * check if a monitor detected a delegation conflict and
 812                  * mark as wouldblock so response is dropped
 813                  */
 814                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 815                         curthread->t_flag |= T_WOULDBLOCK;
 816                 else
 817                         rr->rr_status = puterrno(error);
 818 
 819                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 820                 if (in_crit)
 821                         nbl_end_crit(vp);
 822 
 823                 VN_RELE(vp);
 824                 rr->rr_data = NULL;
 825 
 826                 return;
 827         }
 828 
 829         /*
 830          * Get attributes again so we can send the latest access
 831          * time to the client side for his cache.
 832          */
 833         va.va_mask = AT_ALL;
 834 
 835         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 836 
 837         if (error) {
 838                 if (mp)
 839                         freeb(mp);
 840 
 841                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 842                 if (in_crit)
 843                         nbl_end_crit(vp);
 844 
 845                 VN_RELE(vp);
 846                 rr->rr_data = NULL;
 847                 rr->rr_status = puterrno(error);
 848 
 849                 return;
 850         }
 851 
 852         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 853 
 854         if (mp) {
 855                 rr->rr_data = (char *)mp->b_datap->db_base;
 856         } else {
 857                 if (ra->ra_wlist) {
 858                         rr->rr_data = (caddr_t)iov.iov_base;
 859                         if (!rdma_setup_read_data2(ra, rr)) {
 860                                 rr->rr_data = NULL;
 861                                 rr->rr_status = puterrno(NFSERR_INVAL);
 862                         }
 863                 }
 864         }
 865 done:
 866         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 867         if (in_crit)
 868                 nbl_end_crit(vp);
 869 
 870         acl_perm(vp, exi, &va, cr);
 871 
 872         /* check for overflows */
 873         error = vattr_to_nattr(&va, &rr->rr_attr);
 874 
 875         VN_RELE(vp);
 876 
 877         rr->rr_status = puterrno(error);
 878 }
 879 
 880 /*
 881  * Free data allocated by rfs_read
 882  */
 883 void
 884 rfs_rdfree(struct nfsrdresult *rr)
 885 {
 886         mblk_t *mp;
 887 
 888         if (rr->rr_status == NFS_OK) {
 889                 mp = rr->rr_mp;
 890                 if (mp != NULL)
 891                         freeb(mp);
 892         }
 893 }
 894 
 895 void *
 896 rfs_read_getfh(struct nfsreadargs *ra)
 897 {
 898         return (&ra->ra_fhandle);
 899 }
 900 
 901 #define MAX_IOVECS      12
 902 
 903 #ifdef DEBUG
 904 static int rfs_write_sync_hits = 0;
 905 static int rfs_write_sync_misses = 0;
 906 #endif
 907 
 908 /*
 909  * Write data to file.
 910  * Returns attributes of a file after writing some data to it.
 911  *
 912  * Any changes made here, especially in error handling might have
 913  * to also be done in rfs_write (which clusters write requests).
 914  */
 915 void
 916 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 917         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 918 {
 919         int error;
 920         vnode_t *vp;
 921         rlim64_t rlimit;
 922         struct vattr va;
 923         struct uio uio;
 924         struct iovec iov[MAX_IOVECS];
 925         mblk_t *m;
 926         struct iovec *iovp;
 927         int iovcnt;
 928         cred_t *savecred;
 929         int in_crit = 0;
 930         caller_context_t ct;
 931 
 932         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 933         if (vp == NULL) {
 934                 ns->ns_status = NFSERR_STALE;
 935                 return;
 936         }
 937 
 938         if (rdonly(exi, vp, req)) {
 939                 VN_RELE(vp);
 940                 ns->ns_status = NFSERR_ROFS;
 941                 return;
 942         }
 943 
 944         if (vp->v_type != VREG) {
 945                 VN_RELE(vp);
 946                 ns->ns_status = NFSERR_ISDIR;
 947                 return;
 948         }
 949 
 950         ct.cc_sysid = 0;
 951         ct.cc_pid = 0;
 952         ct.cc_caller_id = nfs2_srv_caller_id;
 953         ct.cc_flags = CC_DONTBLOCK;
 954 
 955         va.va_mask = AT_UID|AT_MODE;
 956 
 957         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 958 
 959         if (error) {
 960                 VN_RELE(vp);
 961                 ns->ns_status = puterrno(error);
 962 
 963                 return;
 964         }
 965 
 966         if (crgetuid(cr) != va.va_uid) {
 967                 /*
 968                  * This is a kludge to allow writes of files created
 969                  * with read only permission.  The owner of the file
 970                  * is always allowed to write it.
 971                  */
 972                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
 973 
 974                 if (error) {
 975                         VN_RELE(vp);
 976                         ns->ns_status = puterrno(error);
 977                         return;
 978                 }
 979         }
 980 
 981         /*
 982          * Can't access a mandatory lock file.  This might cause
 983          * the NFS service thread to block forever waiting for a
 984          * lock to be released that will never be released.
 985          */
 986         if (MANDLOCK(vp, va.va_mode)) {
 987                 VN_RELE(vp);
 988                 ns->ns_status = NFSERR_ACCES;
 989                 return;
 990         }
 991 
 992         /*
 993          * We have to enter the critical region before calling VOP_RWLOCK
 994          * to avoid a deadlock with ufs.
 995          */
 996         if (nbl_need_check(vp)) {
 997                 nbl_start_crit(vp, RW_READER);
 998                 in_crit = 1;
 999                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1000                     wa->wa_count, 0, NULL)) {
1001                         error = EACCES;
1002                         goto out;
1003                 }
1004         }
1005 
1006         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1007 
1008         /* check if a monitor detected a delegation conflict */
1009         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1010                 VN_RELE(vp);
1011                 /* mark as wouldblock so response is dropped */
1012                 curthread->t_flag |= T_WOULDBLOCK;
1013                 return;
1014         }
1015 
1016         if (wa->wa_data || wa->wa_rlist) {
1017                 /* Do the RDMA thing if necessary */
1018                 if (wa->wa_rlist) {
1019                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1020                         iov[0].iov_len = wa->wa_count;
1021                 } else  {
1022                         iov[0].iov_base = wa->wa_data;
1023                         iov[0].iov_len = wa->wa_count;
1024                 }
1025                 uio.uio_iov = iov;
1026                 uio.uio_iovcnt = 1;
1027                 uio.uio_segflg = UIO_SYSSPACE;
1028                 uio.uio_extflg = UIO_COPY_DEFAULT;
1029                 uio.uio_loffset = (offset_t)wa->wa_offset;
1030                 uio.uio_resid = wa->wa_count;
1031                 /*
1032                  * The limit is checked on the client. We
1033                  * should allow any size writes here.
1034                  */
1035                 uio.uio_llimit = curproc->p_fsz_ctl;
1036                 rlimit = uio.uio_llimit - wa->wa_offset;
1037                 if (rlimit < (rlim64_t)uio.uio_resid)
1038                         uio.uio_resid = (uint_t)rlimit;
1039 
1040                 /*
1041                  * for now we assume no append mode
1042                  */
1043                 /*
1044                  * We're changing creds because VM may fault and we need
1045                  * the cred of the current thread to be used if quota
1046                  * checking is enabled.
1047                  */
1048                 savecred = curthread->t_cred;
1049                 curthread->t_cred = cr;
1050                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1051                 curthread->t_cred = savecred;
1052         } else {
1053                 iovcnt = 0;
1054                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1055                         iovcnt++;
1056                 if (iovcnt <= MAX_IOVECS) {
1057 #ifdef DEBUG
1058                         rfs_write_sync_hits++;
1059 #endif
1060                         iovp = iov;
1061                 } else {
1062 #ifdef DEBUG
1063                         rfs_write_sync_misses++;
1064 #endif
1065                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1066                 }
1067                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1068                 uio.uio_iov = iovp;
1069                 uio.uio_iovcnt = iovcnt;
1070                 uio.uio_segflg = UIO_SYSSPACE;
1071                 uio.uio_extflg = UIO_COPY_DEFAULT;
1072                 uio.uio_loffset = (offset_t)wa->wa_offset;
1073                 uio.uio_resid = wa->wa_count;
1074                 /*
1075                  * The limit is checked on the client. We
1076                  * should allow any size writes here.
1077                  */
1078                 uio.uio_llimit = curproc->p_fsz_ctl;
1079                 rlimit = uio.uio_llimit - wa->wa_offset;
1080                 if (rlimit < (rlim64_t)uio.uio_resid)
1081                         uio.uio_resid = (uint_t)rlimit;
1082 
1083                 /*
1084                  * For now we assume no append mode.
1085                  */
1086                 /*
1087                  * We're changing creds because VM may fault and we need
1088                  * the cred of the current thread to be used if quota
1089                  * checking is enabled.
1090                  */
1091                 savecred = curthread->t_cred;
1092                 curthread->t_cred = cr;
1093                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1094                 curthread->t_cred = savecred;
1095 
1096                 if (iovp != iov)
1097                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1098         }
1099 
1100         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1101 
1102         if (!error) {
1103                 /*
1104                  * Get attributes again so we send the latest mod
1105                  * time to the client side for his cache.
1106                  */
1107                 va.va_mask = AT_ALL;    /* now we want everything */
1108 
1109                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1110 
1111                 /* check for overflows */
1112                 if (!error) {
1113                         acl_perm(vp, exi, &va, cr);
1114                         error = vattr_to_nattr(&va, &ns->ns_attr);
1115                 }
1116         }
1117 
1118 out:
1119         if (in_crit)
1120                 nbl_end_crit(vp);
1121         VN_RELE(vp);
1122 
1123         /* check if a monitor detected a delegation conflict */
1124         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1125                 /* mark as wouldblock so response is dropped */
1126                 curthread->t_flag |= T_WOULDBLOCK;
1127         else
1128                 ns->ns_status = puterrno(error);
1129 
1130 }
1131 
/*
 * One pending WRITE request within a cluster.  Allocated on the stack
 * of the service thread that issued the request (see nrpsp in
 * rfs_write); the thread sleeps until ns_status leaves RFSWRITE_INITVAL.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* the client's write arguments */
	struct nfsattrstat *ns;		/* response; status doubles as done flag */
	struct svc_req *req;		/* RPC request, used for rdonly() check */
	cred_t *cr;			/* credentials of the requesting thread */
	kthread_t *thread;		/* thread to propagate T_WOULDBLOCK to */
	struct rfs_async_write *list;	/* next request, sorted by wa_offset */
};
1140 
/*
 * One write cluster: all currently-queued WRITE requests for a single
 * file handle.  Lives on the stack of the thread that created the
 * cluster (see nlpsp in rfs_write).
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* broadcast when requests are completed */
	struct rfs_async_write *list;	/* requests, ordered by starting offset */
	struct rfs_async_write_list *next;	/* next cluster (other files) */
};
1147 
/* List of open write clusters; protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* On-stack iovec slots for a gathered cluster write; more uses kmem_alloc. */
#define	MAXCLIOVECS	42
/* Sentinel "not yet processed" ns_status; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: gathers served by the on-stack iovec array vs. heap fallback. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1159 
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Concurrent WRITE requests against the same file handle are gathered
 * into a "cluster".  The first thread to arrive creates the cluster,
 * translates the file handle and takes the write lock; threads that
 * arrive while the cluster is open insert themselves (sorted by offset)
 * and sleep on the cluster's cv.  The owning thread then issues as few
 * VOP_WRITE calls as possible for contiguous runs, fills in everyone's
 * status, and broadcasts.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;		/* this request's cluster entry */
	struct rfs_async_write_list nlpsp;	/* cluster head, if we create one */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	/* Clustering disabled: fall back to the simple synchronous path. */
	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insert sorted by wa_offset so contiguous runs can be found. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink our cluster and fail every queued request. */
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, vp, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		/* Note: each request is checked with its own credentials. */
		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		/*
		 * Advance lrp to one past the last request of a run of
		 * contiguous, error-free requests starting at rp.
		 */
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else  {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Clamp the gather to wa_count even if the
				 * mblk chain carries more data.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Any request still marked RFSWRITE_INITVAL was never reached by
	 * the gather loop; give it the final error status and wake
	 * everyone sleeping on this cluster.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

}
1657 
1658 void *
1659 rfs_write_getfh(struct nfswriteargs *wa)
1660 {
1661         return (&wa->wa_fhandle);
1662 }
1663 
1664 /*
1665  * Create a file.
1666  * Creates a file with given attributes and returns those attributes
1667  * and an fhandle for the new file.
1668  */
1669 void
1670 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1671         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1672 {
1673         int error;
1674         int lookuperr;
1675         int in_crit = 0;
1676         struct vattr va;
1677         vnode_t *vp;
1678         vnode_t *realvp;
1679         vnode_t *dvp;
1680         char *name = args->ca_da.da_name;
1681         vnode_t *tvp = NULL;
1682         int mode;
1683         int lookup_ok;
1684         bool_t trunc;
1685         struct sockaddr *ca;
1686 
1687         /*
1688          * Disallow NULL paths
1689          */
1690         if (name == NULL || *name == '\0') {
1691                 dr->dr_status = NFSERR_ACCES;
1692                 return;
1693         }
1694 
1695         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1696         if (dvp == NULL) {
1697                 dr->dr_status = NFSERR_STALE;
1698                 return;
1699         }
1700 
1701         error = sattr_to_vattr(args->ca_sa, &va);
1702         if (error) {
1703                 dr->dr_status = puterrno(error);
1704                 return;
1705         }
1706 
1707         /*
1708          * Must specify the mode.
1709          */
1710         if (!(va.va_mask & AT_MODE)) {
1711                 VN_RELE(dvp);
1712                 dr->dr_status = NFSERR_INVAL;
1713                 return;
1714         }
1715 
1716         /*
1717          * This is a completely gross hack to make mknod
1718          * work over the wire until we can wack the protocol
1719          */
1720         if ((va.va_mode & IFMT) == IFCHR) {
1721                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1722                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1723                 else {
1724                         va.va_type = VCHR;
1725                         /*
1726                          * uncompress the received dev_t
1727                          * if the top half is zero indicating a request
1728                          * from an `older style' OS.
1729                          */
1730                         if ((va.va_size & 0xffff0000) == 0)
1731                                 va.va_rdev = nfsv2_expdev(va.va_size);
1732                         else
1733                                 va.va_rdev = (dev_t)va.va_size;
1734                 }
1735                 va.va_mask &= ~AT_SIZE;
1736         } else if ((va.va_mode & IFMT) == IFBLK) {
1737                 va.va_type = VBLK;
1738                 /*
1739                  * uncompress the received dev_t
1740                  * if the top half is zero indicating a request
1741                  * from an `older style' OS.
1742                  */
1743                 if ((va.va_size & 0xffff0000) == 0)
1744                         va.va_rdev = nfsv2_expdev(va.va_size);
1745                 else
1746                         va.va_rdev = (dev_t)va.va_size;
1747                 va.va_mask &= ~AT_SIZE;
1748         } else if ((va.va_mode & IFMT) == IFSOCK) {
1749                 va.va_type = VSOCK;
1750         } else {
1751                 va.va_type = VREG;
1752         }
1753         va.va_mode &= ~IFMT;
1754         va.va_mask |= AT_TYPE;
1755 
1756         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1757         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1758             MAXPATHLEN);
1759         if (name == NULL) {
1760                 dr->dr_status = puterrno(EINVAL);
1761                 return;
1762         }
1763 
1764         /*
1765          * Why was the choice made to use VWRITE as the mode to the
1766          * call to VOP_CREATE ? This results in a bug.  When a client
1767          * opens a file that already exists and is RDONLY, the second
1768          * open fails with an EACESS because of the mode.
1769          * bug ID 1054648.
1770          */
1771         lookup_ok = 0;
1772         mode = VWRITE;
1773         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1774                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1775                     NULL, NULL, NULL);
1776                 if (!error) {
1777                         struct vattr at;
1778 
1779                         lookup_ok = 1;
1780                         at.va_mask = AT_MODE;
1781                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1782                         if (!error)
1783                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1784                         VN_RELE(tvp);
1785                         tvp = NULL;
1786                 }
1787         }
1788 
1789         if (!lookup_ok) {
1790                 if (rdonly(exi, dvp, req)) {
1791                         error = EROFS;
1792                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1793                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1794                         error = EPERM;
1795                 } else {
1796                         error = 0;
1797                 }
1798         }
1799 
1800         /*
1801          * If file size is being modified on an already existing file
1802          * make sure that there are no conflicting non-blocking mandatory
1803          * locks in the region being manipulated. Return EACCES if there
1804          * are conflicting locks.
1805          */
1806         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1807                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1808                     NULL, NULL, NULL);
1809 
1810                 if (!lookuperr &&
1811                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1812                         VN_RELE(tvp);
1813                         curthread->t_flag |= T_WOULDBLOCK;
1814                         goto out;
1815                 }
1816 
1817                 if (!lookuperr && nbl_need_check(tvp)) {
1818                         /*
1819                          * The file exists. Now check if it has any
1820                          * conflicting non-blocking mandatory locks
1821                          * in the region being changed.
1822                          */
1823                         struct vattr bva;
1824                         u_offset_t offset;
1825                         ssize_t length;
1826 
1827                         nbl_start_crit(tvp, RW_READER);
1828                         in_crit = 1;
1829 
1830                         bva.va_mask = AT_SIZE;
1831                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1832                         if (!error) {
1833                                 if (va.va_size < bva.va_size) {
1834                                         offset = va.va_size;
1835                                         length = bva.va_size - va.va_size;
1836                                 } else {
1837                                         offset = bva.va_size;
1838                                         length = va.va_size - bva.va_size;
1839                                 }
1840                                 if (length) {
1841                                         if (nbl_conflict(tvp, NBL_WRITE,
1842                                             offset, length, 0, NULL)) {
1843                                                 error = EACCES;
1844                                         }
1845                                 }
1846                         }
1847                         if (error) {
1848                                 nbl_end_crit(tvp);
1849                                 VN_RELE(tvp);
1850                                 in_crit = 0;
1851                         }
1852                 } else if (tvp != NULL) {
1853                         VN_RELE(tvp);
1854                 }
1855         }
1856 
1857         if (!error) {
1858                 /*
1859                  * If filesystem is shared with nosuid the remove any
1860                  * setuid/setgid bits on create.
1861                  */
1862                 if (va.va_type == VREG &&
1863                     exi->exi_export.ex_flags & EX_NOSUID)
1864                         va.va_mode &= ~(VSUID | VSGID);
1865 
1866                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1867                     NULL, NULL);
1868 
1869                 if (!error) {
1870 
1871                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1872                                 trunc = TRUE;
1873                         else
1874                                 trunc = FALSE;
1875 
1876                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1877                                 VN_RELE(vp);
1878                                 curthread->t_flag |= T_WOULDBLOCK;
1879                                 goto out;
1880                         }
1881                         va.va_mask = AT_ALL;
1882 
1883                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1884 
1885                         /* check for overflows */
1886                         if (!error) {
1887                                 acl_perm(vp, exi, &va, cr);
1888                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1889                                 if (!error) {
1890                                         error = makefh(&dr->dr_fhandle, vp,
1891                                             exi);
1892                                 }
1893                         }
1894                         /*
1895                          * Force modified metadata out to stable storage.
1896                          *
1897                          * if a underlying vp exists, pass it to VOP_FSYNC
1898                          */
1899                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1900                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1901                         else
1902                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1903                         VN_RELE(vp);
1904                 }
1905 
1906                 if (in_crit) {
1907                         nbl_end_crit(tvp);
1908                         VN_RELE(tvp);
1909                 }
1910         }
1911 
1912         /*
1913          * Force modified data and metadata out to stable storage.
1914          */
1915         (void) VOP_FSYNC(dvp, 0, cr, NULL);
1916 
1917 out:
1918 
1919         VN_RELE(dvp);
1920 
1921         dr->dr_status = puterrno(error);
1922 
1923         if (name != args->ca_da.da_name)
1924                 kmem_free(name, MAXPATHLEN);
1925 }
/*
 * Return the file handle of the parent directory from the NFSv2 create
 * arguments.
 */
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
1931 
/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* file being removed */
	int in_crit = 0;	/* nonzero: inside nbmand critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* Removing an entry writes the directory: reject read-only exports. */
	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		/* *status is intentionally left unset: the request is dropped */
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		/* Hold the target in a critical region across the removal. */
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2016 
/*
 * Return the file handle of the parent directory from the NFSv2 remove
 * arguments.
 */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2022 
/*
 * rename a file
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* nonzero: inside nbmand critical region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must live in the same export as the one
	 * this request was addressed to; renames may not cross exports.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/*
	 * The reference from checkexport() is dropped here; only the
	 * pointer value is compared below, never dereferenced.
	 */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, tovp, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		/* *status is intentionally left unset: the request is dropped */
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		/* Hold the source in a critical region across the rename. */
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/*
 * Return the file handle of the source directory from the NFSv2 rename
 * arguments.
 */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
2170 
2171 /*
2172  * Link to a file.
2173  * Create a file (to) which is a hard link to the given file (from).
2174  */
2175 void
2176 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2177         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2178 {
2179         int error;
2180         vnode_t *fromvp;
2181         vnode_t *tovp;
2182         struct exportinfo *to_exi;
2183         fhandle_t *fh;
2184 
2185         fromvp = nfs_fhtovp(args->la_from, exi);
2186         if (fromvp == NULL) {
2187                 *status = NFSERR_STALE;
2188                 return;
2189         }
2190 
2191         fh = args->la_to.da_fhandle;
2192         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2193         if (to_exi == NULL) {
2194                 VN_RELE(fromvp);
2195                 *status = NFSERR_ACCES;
2196                 return;
2197         }
2198         exi_rele(to_exi);
2199 
2200         if (to_exi != exi) {
2201                 VN_RELE(fromvp);
2202                 *status = NFSERR_XDEV;
2203                 return;
2204         }
2205 
2206         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2207         if (tovp == NULL) {
2208                 VN_RELE(fromvp);
2209                 *status = NFSERR_STALE;
2210                 return;
2211         }
2212 
2213         if (tovp->v_type != VDIR) {
2214                 VN_RELE(tovp);
2215                 VN_RELE(fromvp);
2216                 *status = NFSERR_NOTDIR;
2217                 return;
2218         }
2219         /*
2220          * Disallow NULL paths
2221          */
2222         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2223                 VN_RELE(tovp);
2224                 VN_RELE(fromvp);
2225                 *status = NFSERR_ACCES;
2226                 return;
2227         }
2228 
2229         if (rdonly(exi, tovp, req)) {
2230                 VN_RELE(tovp);
2231                 VN_RELE(fromvp);
2232                 *status = NFSERR_ROFS;
2233                 return;
2234         }
2235 
2236         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2237 
2238         /*
2239          * Force modified data and metadata out to stable storage.
2240          */
2241         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2242         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2243 
2244         VN_RELE(tovp);
2245         VN_RELE(fromvp);
2246 
2247         *status = puterrno(error);
2248 
2249 }
/*
 * Return the file handle of the source file from the NFSv2 link
 * arguments.
 */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
2255 
2256 /*
2257  * Symbolicly link to a file.
2258  * Create a file (to) with the given attributes which is a symbolic link
2259  * to the given path name (to).
2260  */
2261 void
2262 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2263         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2264 {
2265         int error;
2266         struct vattr va;
2267         vnode_t *vp;
2268         vnode_t *svp;
2269         int lerror;
2270         struct sockaddr *ca;
2271         char *name = NULL;
2272 
2273         /*
2274          * Disallow NULL paths
2275          */
2276         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2277                 *status = NFSERR_ACCES;
2278                 return;
2279         }
2280 
2281         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2282         if (vp == NULL) {
2283                 *status = NFSERR_STALE;
2284                 return;
2285         }
2286 
2287         if (rdonly(exi, vp, req)) {
2288                 VN_RELE(vp);
2289                 *status = NFSERR_ROFS;
2290                 return;
2291         }
2292 
2293         error = sattr_to_vattr(args->sla_sa, &va);
2294         if (error) {
2295                 VN_RELE(vp);
2296                 *status = puterrno(error);
2297                 return;
2298         }
2299 
2300         if (!(va.va_mask & AT_MODE)) {
2301                 VN_RELE(vp);
2302                 *status = NFSERR_INVAL;
2303                 return;
2304         }
2305 
2306         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2307         name = nfscmd_convname(ca, exi, args->sla_tnm,
2308             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2309 
2310         if (name == NULL) {
2311                 *status = NFSERR_ACCES;
2312                 return;
2313         }
2314 
2315         va.va_type = VLNK;
2316         va.va_mask |= AT_TYPE;
2317 
2318         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2319 
2320         /*
2321          * Force new data and metadata out to stable storage.
2322          */
2323         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2324             NULL, cr, NULL, NULL, NULL);
2325 
2326         if (!lerror) {
2327                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2328                 VN_RELE(svp);
2329         }
2330 
2331         /*
2332          * Force modified data and metadata out to stable storage.
2333          */
2334         (void) VOP_FSYNC(vp, 0, cr, NULL);
2335 
2336         VN_RELE(vp);
2337 
2338         *status = puterrno(error);
2339         if (name != args->sla_tnm)
2340                 kmem_free(name, MAXPATHLEN);
2341 
2342 }
/*
 * Return the file handle of the parent directory from the NFSv2 symlink
 * arguments.
 */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
2348 
2349 /*
2350  * Make a directory.
2351  * Create a directory with the given name, parent directory, and attributes.
2352  * Returns a file handle and attributes for the new directory.
2353  */
2354 void
2355 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2356         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2357 {
2358         int error;
2359         struct vattr va;
2360         vnode_t *dvp = NULL;
2361         vnode_t *vp;
2362         char *name = args->ca_da.da_name;
2363 
2364         /*
2365          * Disallow NULL paths
2366          */
2367         if (name == NULL || *name == '\0') {
2368                 dr->dr_status = NFSERR_ACCES;
2369                 return;
2370         }
2371 
2372         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2373         if (vp == NULL) {
2374                 dr->dr_status = NFSERR_STALE;
2375                 return;
2376         }
2377 
2378         if (rdonly(exi, vp, req)) {
2379                 VN_RELE(vp);
2380                 dr->dr_status = NFSERR_ROFS;
2381                 return;
2382         }
2383 
2384         error = sattr_to_vattr(args->ca_sa, &va);
2385         if (error) {
2386                 VN_RELE(vp);
2387                 dr->dr_status = puterrno(error);
2388                 return;
2389         }
2390 
2391         if (!(va.va_mask & AT_MODE)) {
2392                 VN_RELE(vp);
2393                 dr->dr_status = NFSERR_INVAL;
2394                 return;
2395         }
2396 
2397         va.va_type = VDIR;
2398         va.va_mask |= AT_TYPE;
2399 
2400         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2401 
2402         if (!error) {
2403                 /*
2404                  * Attribtutes of the newly created directory should
2405                  * be returned to the client.
2406                  */
2407                 va.va_mask = AT_ALL; /* We want everything */
2408                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2409 
2410                 /* check for overflows */
2411                 if (!error) {
2412                         acl_perm(vp, exi, &va, cr);
2413                         error = vattr_to_nattr(&va, &dr->dr_attr);
2414                         if (!error) {
2415                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2416                         }
2417                 }
2418                 /*
2419                  * Force new data and metadata out to stable storage.
2420                  */
2421                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2422                 VN_RELE(dvp);
2423         }
2424 
2425         /*
2426          * Force modified data and metadata out to stable storage.
2427          */
2428         (void) VOP_FSYNC(vp, 0, cr, NULL);
2429 
2430         VN_RELE(vp);
2431 
2432         dr->dr_status = puterrno(error);
2433 
2434 }
/*
 * Return the file handle of the parent directory from the NFSv2 mkdir
 * arguments.
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2440 
2441 /*
2442  * Remove a directory.
2443  * Remove the given directory name from the given parent directory.
2444  */
2445 void
2446 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2447         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2448 {
2449         int error;
2450         vnode_t *vp;
2451 
2452         /*
2453          * Disallow NULL paths
2454          */
2455         if (da->da_name == NULL || *da->da_name == '\0') {
2456                 *status = NFSERR_ACCES;
2457                 return;
2458         }
2459 
2460         vp = nfs_fhtovp(da->da_fhandle, exi);
2461         if (vp == NULL) {
2462                 *status = NFSERR_STALE;
2463                 return;
2464         }
2465 
2466         if (rdonly(exi, vp, req)) {
2467                 VN_RELE(vp);
2468                 *status = NFSERR_ROFS;
2469                 return;
2470         }
2471 
2472         /*
2473          * VOP_RMDIR takes a third argument (the current
2474          * directory of the process).  That's because someone
2475          * wants to return EINVAL if one tries to remove ".".
2476          * Of course, NFS servers have no idea what their
2477          * clients' current directories are.  We fake it by
2478          * supplying a vnode known to exist and illegal to
2479          * remove.
2480          */
2481         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2482 
2483         /*
2484          * Force modified data and metadata out to stable storage.
2485          */
2486         (void) VOP_FSYNC(vp, 0, cr, NULL);
2487 
2488         VN_RELE(vp);
2489 
2490         /*
2491          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2492          * if the directory is not empty.  A System V NFS server
2493          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2494          * over the wire.
2495          */
2496         if (error == EEXIST)
2497                 *status = NFSERR_NOTEMPTY;
2498         else
2499                 *status = puterrno(error);
2500 
2501 }
/*
 * Return the file handle of the parent directory from the NFSv2 rmdir
 * arguments.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2507 
2508 /* ARGSUSED */
2509 void
2510 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2511         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2512 {
2513         int error;
2514         int iseof;
2515         struct iovec iov;
2516         struct uio uio;
2517         vnode_t *vp;
2518         char *ndata = NULL;
2519         struct sockaddr *ca;
2520         size_t nents;
2521         int ret;
2522 
2523         vp = nfs_fhtovp(&rda->rda_fh, exi);
2524         if (vp == NULL) {
2525                 rd->rd_entries = NULL;
2526                 rd->rd_status = NFSERR_STALE;
2527                 return;
2528         }
2529 
2530         if (vp->v_type != VDIR) {
2531                 VN_RELE(vp);
2532                 rd->rd_entries = NULL;
2533                 rd->rd_status = NFSERR_NOTDIR;
2534                 return;
2535         }
2536 
2537         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2538 
2539         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2540 
2541         if (error) {
2542                 rd->rd_entries = NULL;
2543                 goto bad;
2544         }
2545 
2546         if (rda->rda_count == 0) {
2547                 rd->rd_entries = NULL;
2548                 rd->rd_size = 0;
2549                 rd->rd_eof = FALSE;
2550                 goto bad;
2551         }
2552 
2553         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2554 
2555         /*
2556          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2557          */
2558         rd->rd_bufsize = (uint_t)rda->rda_count;
2559         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2560 
2561         /*
2562          * Set up io vector to read directory data
2563          */
2564         iov.iov_base = (caddr_t)rd->rd_entries;
2565         iov.iov_len = rda->rda_count;
2566         uio.uio_iov = &iov;
2567         uio.uio_iovcnt = 1;
2568         uio.uio_segflg = UIO_SYSSPACE;
2569         uio.uio_extflg = UIO_COPY_CACHED;
2570         uio.uio_loffset = (offset_t)rda->rda_offset;
2571         uio.uio_resid = rda->rda_count;
2572 
2573         /*
2574          * read directory
2575          */
2576         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2577 
2578         /*
2579          * Clean up
2580          */
2581         if (!error) {
2582                 /*
2583                  * set size and eof
2584                  */
2585                 if (uio.uio_resid == rda->rda_count) {
2586                         rd->rd_size = 0;
2587                         rd->rd_eof = TRUE;
2588                 } else {
2589                         rd->rd_size = (uint32_t)(rda->rda_count -
2590                             uio.uio_resid);
2591                         rd->rd_eof = iseof ? TRUE : FALSE;
2592                 }
2593         }
2594 
2595         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2596         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2597         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2598             rda->rda_count, &ndata);
2599 
2600         if (ret != 0) {
2601                 size_t dropbytes;
2602                 /*
2603                  * We had to drop one or more entries in order to fit
2604                  * during the character conversion.  We need to patch
2605                  * up the size and eof info.
2606                  */
2607                 if (rd->rd_eof)
2608                         rd->rd_eof = FALSE;
2609                 dropbytes = nfscmd_dropped_entrysize(
2610                     (struct dirent64 *)rd->rd_entries, nents, ret);
2611                 rd->rd_size -= dropbytes;
2612         }
2613         if (ndata == NULL) {
2614                 ndata = (char *)rd->rd_entries;
2615         } else if (ndata != (char *)rd->rd_entries) {
2616                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2617                 rd->rd_entries = (void *)ndata;
2618                 rd->rd_bufsize = rda->rda_count;
2619         }
2620 
2621 bad:
2622         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2623 
2624 #if 0 /* notyet */
2625         /*
2626          * Don't do this.  It causes local disk writes when just
2627          * reading the file and the overhead is deemed larger
2628          * than the benefit.
2629          */
2630         /*
2631          * Force modified metadata out to stable storage.
2632          */
2633         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2634 #endif
2635 
2636         VN_RELE(vp);
2637 
2638         rd->rd_status = puterrno(error);
2639 
2640 }
2641 void *
2642 rfs_readdir_getfh(struct nfsrddirargs *rda)
2643 {
2644         return (&rda->rda_fh);
2645 }
2646 void
2647 rfs_rddirfree(struct nfsrddirres *rd)
2648 {
2649         if (rd->rd_entries != NULL)
2650                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 }
2652 
2653 /* ARGSUSED */
2654 void
2655 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2656         struct svc_req *req, cred_t *cr)
2657 {
2658         int error;
2659         struct statvfs64 sb;
2660         vnode_t *vp;
2661 
2662         vp = nfs_fhtovp(fh, exi);
2663         if (vp == NULL) {
2664                 fs->fs_status = NFSERR_STALE;
2665                 return;
2666         }
2667 
2668         error = VFS_STATVFS(vp->v_vfsp, &sb);
2669 
2670         if (!error) {
2671                 fs->fs_tsize = nfstsize();
2672                 fs->fs_bsize = sb.f_frsize;
2673                 fs->fs_blocks = sb.f_blocks;
2674                 fs->fs_bfree = sb.f_bfree;
2675                 fs->fs_bavail = sb.f_bavail;
2676         }
2677 
2678         VN_RELE(vp);
2679 
2680         fs->fs_status = puterrno(error);
2681 
2682 }
2683 void *
2684 rfs_statfs_getfh(fhandle_t *fh)
2685 {
2686         return (fh);
2687 }
2688 
/*
 * Convert the over-the-wire NFSv2 settable attributes (struct nfssattr)
 * into a vattr suitable for VOP_SETATTR.  In the NFSv2 protocol a field
 * value of (uint32_t)-1 means "do not set this attribute"; each field
 * that is actually supplied is flagged in vap->va_mask.
 *
 * Returns 0 on success, or EOVERFLOW (32-bit kernels only) when a
 * client-supplied time cannot be represented in the kernel's time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* -1 in either time field means the client did not supply it */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire format carries microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire format carries microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2755 
2756 static enum nfsftype vt_to_nf[] = {
2757         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2758 };
2759 
2760 /*
2761  * check the following fields for overflow: nodeid, size, and time.
2762  * There could be a problem when converting 64-bit LP64 fields
2763  * into 32-bit ones.  Return an error if there is an overflow.
2764  */
2765 int
2766 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2767 {
2768         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2769         na->na_type = vt_to_nf[vap->va_type];
2770 
2771         if (vap->va_mode == (unsigned short) -1)
2772                 na->na_mode = (uint32_t)-1;
2773         else
2774                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2775 
2776         if (vap->va_uid == (unsigned short)(-1))
2777                 na->na_uid = (uint32_t)(-1);
2778         else if (vap->va_uid == UID_NOBODY)
2779                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2780         else
2781                 na->na_uid = vap->va_uid;
2782 
2783         if (vap->va_gid == (unsigned short)(-1))
2784                 na->na_gid = (uint32_t)-1;
2785         else if (vap->va_gid == GID_NOBODY)
2786                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2787         else
2788                 na->na_gid = vap->va_gid;
2789 
2790         /*
2791          * Do we need to check fsid for overflow?  It is 64-bit in the
2792          * vattr, but are bigger than 32 bit values supported?
2793          */
2794         na->na_fsid = vap->va_fsid;
2795 
2796         na->na_nodeid = vap->va_nodeid;
2797 
2798         /*
2799          * Check to make sure that the nodeid is representable over the
2800          * wire without losing bits.
2801          */
2802         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2803                 return (EFBIG);
2804         na->na_nlink = vap->va_nlink;
2805 
2806         /*
2807          * Check for big files here, instead of at the caller.  See
2808          * comments in cstat for large special file explanation.
2809          */
2810         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2811                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2812                         return (EFBIG);
2813                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2814                         /* UNKNOWN_SIZE | OVERFLOW */
2815                         na->na_size = MAXOFF32_T;
2816                 } else
2817                         na->na_size = vap->va_size;
2818         } else
2819                 na->na_size = vap->va_size;
2820 
2821         /*
2822          * If the vnode times overflow the 32-bit times that NFS2
2823          * uses on the wire then return an error.
2824          */
2825         if (!NFS_VAP_TIME_OK(vap)) {
2826                 return (EOVERFLOW);
2827         }
2828         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2829         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2830 
2831         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2832         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2833 
2834         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2835         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2836 
2837         /*
2838          * If the dev_t will fit into 16 bits then compress
2839          * it, otherwise leave it alone. See comments in
2840          * nfs_client.c.
2841          */
2842         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2843             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2844                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2845         else
2846                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2847 
2848         na->na_blocks = vap->va_nblocks;
2849         na->na_blocksize = vap->va_blksize;
2850 
2851         /*
2852          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2853          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2854          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2855          *
2856          * BUYER BEWARE:
2857          *  If you are porting the NFS to a non-Sun server, you probably
2858          *  don't want to include the following block of code.  The
2859          *  over-the-wire special file types will be changing with the
2860          *  NFS Protocol Revision.
2861          */
2862         if (vap->va_type == VFIFO)
2863                 NA_SETFIFO(na);
2864         return (0);
2865 }
2866 
2867 /*
2868  * acl v2 support: returns approximate permission.
2869  *      default: returns minimal permission (more restrictive)
2870  *      aclok: returns maximal permission (less restrictive)
2871  *      This routine changes the permissions that are alaredy in *va.
2872  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2873  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2874  */
2875 static void
2876 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2877 {
2878         vsecattr_t      vsa;
2879         int             aclcnt;
2880         aclent_t        *aclentp;
2881         mode_t          mask_perm;
2882         mode_t          grp_perm;
2883         mode_t          other_perm;
2884         mode_t          other_orig;
2885         int             error;
2886 
2887         /* dont care default acl */
2888         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2889         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2890 
2891         if (!error) {
2892                 aclcnt = vsa.vsa_aclcnt;
2893                 if (aclcnt > MIN_ACL_ENTRIES) {
2894                         /* non-trivial ACL */
2895                         aclentp = vsa.vsa_aclentp;
2896                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2897                                 /* maximal permissions */
2898                                 grp_perm = 0;
2899                                 other_perm = 0;
2900                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2901                                         switch (aclentp->a_type) {
2902                                         case USER_OBJ:
2903                                                 break;
2904                                         case USER:
2905                                                 grp_perm |=
2906                                                     aclentp->a_perm << 3;
2907                                                 other_perm |= aclentp->a_perm;
2908                                                 break;
2909                                         case GROUP_OBJ:
2910                                                 grp_perm |=
2911                                                     aclentp->a_perm << 3;
2912                                                 break;
2913                                         case GROUP:
2914                                                 other_perm |= aclentp->a_perm;
2915                                                 break;
2916                                         case OTHER_OBJ:
2917                                                 other_orig = aclentp->a_perm;
2918                                                 break;
2919                                         case CLASS_OBJ:
2920                                                 mask_perm = aclentp->a_perm;
2921                                                 break;
2922                                         default:
2923                                                 break;
2924                                         }
2925                                 }
2926                                 grp_perm &= mask_perm << 3;
2927                                 other_perm &= mask_perm;
2928                                 other_perm |= other_orig;
2929 
2930                         } else {
2931                                 /* minimal permissions */
2932                                 grp_perm = 070;
2933                                 other_perm = 07;
2934                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2935                                         switch (aclentp->a_type) {
2936                                         case USER_OBJ:
2937                                                 break;
2938                                         case USER:
2939                                         case CLASS_OBJ:
2940                                                 grp_perm &=
2941                                                     aclentp->a_perm << 3;
2942                                                 other_perm &=
2943                                                     aclentp->a_perm;
2944                                                 break;
2945                                         case GROUP_OBJ:
2946                                                 grp_perm &=
2947                                                     aclentp->a_perm << 3;
2948                                                 break;
2949                                         case GROUP:
2950                                                 other_perm &=
2951                                                     aclentp->a_perm;
2952                                                 break;
2953                                         case OTHER_OBJ:
2954                                                 other_perm &=
2955                                                     aclentp->a_perm;
2956                                                 break;
2957                                         default:
2958                                                 break;
2959                                         }
2960                                 }
2961                         }
2962                         /* copy to va */
2963                         va->va_mode &= ~077;
2964                         va->va_mode |= grp_perm | other_perm;
2965                 }
2966                 if (vsa.vsa_aclcnt)
2967                         kmem_free(vsa.vsa_aclentp,
2968                             vsa.vsa_aclcnt * sizeof (aclent_t));
2969         }
2970 }
2971 
2972 void
2973 rfs_srvrinit(void)
2974 {
2975         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2976         nfs2_srv_caller_id = fs_new_caller_id();
2977 }
2978 
2979 void
2980 rfs_srvrfini(void)
2981 {
2982         mutex_destroy(&rfs_async_write_lock);
2983 }
2984 
2985 static int
2986 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2987 {
2988         struct clist    *wcl;
2989         int             wlist_len;
2990         uint32_t        count = rr->rr_count;
2991 
2992         wcl = ra->ra_wlist;
2993 
2994         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2995                 return (FALSE);
2996         }
2997 
2998         wcl = ra->ra_wlist;
2999         rr->rr_ok.rrok_wlist_len = wlist_len;
3000         rr->rr_ok.rrok_wlist = wcl;
3001 
3002         return (TRUE);
3003 }