Print this page
patch remove-dont-swap-flag
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 28 * All rights reserved.
29 29 */
30 30
31 31 #include <sys/param.h>
32 32 #include <sys/types.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/cred.h>
35 35 #include <sys/buf.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/uio.h>
39 39 #include <sys/stat.h>
40 40 #include <sys/errno.h>
41 41 #include <sys/sysmacros.h>
42 42 #include <sys/statvfs.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/kstat.h>
45 45 #include <sys/dirent.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/vtrace.h>
49 49 #include <sys/mode.h>
50 50 #include <sys/acl.h>
51 51 #include <sys/nbmlock.h>
52 52 #include <sys/policy.h>
53 53 #include <sys/sdt.h>
54 54
55 55 #include <rpc/types.h>
56 56 #include <rpc/auth.h>
57 57 #include <rpc/svc.h>
58 58
59 59 #include <nfs/nfs.h>
60 60 #include <nfs/export.h>
61 61 #include <nfs/nfs_cmd.h>
62 62
63 63 #include <vm/hat.h>
64 64 #include <vm/as.h>
65 65 #include <vm/seg.h>
66 66 #include <vm/seg_map.h>
67 67 #include <vm/seg_kmem.h>
68 68
69 69 #include <sys/strsubr.h>
70 70
71 71 /*
72 72 * These are the interface routines for the server side of the
73 73 * Network File System. See the NFS version 2 protocol specification
74 74 * for a description of this interface.
75 75 */
76 76
/* Forward declarations for helpers defined later in this file. */
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
    cred_t *);

/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/*
 * Unique caller id stamped into caller_context_t (cc_caller_id) for
 * all NFSv2 server operations, so delegation monitors can recognize
 * requests issued by this server.
 */
u_longlong_t nfs2_srv_caller_id;
91 91
92 92 /*
93 93 * Get file attributes.
94 94 * Returns the current attributes of the file with the given fhandle.
95 95 */
96 96 /* ARGSUSED */
97 97 void
98 98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 99 struct svc_req *req, cred_t *cr)
100 100 {
101 101 int error;
102 102 vnode_t *vp;
103 103 struct vattr va;
104 104
105 105 vp = nfs_fhtovp(fhp, exi);
106 106 if (vp == NULL) {
107 107 ns->ns_status = NFSERR_STALE;
108 108 return;
109 109 }
110 110
111 111 /*
112 112 * Do the getattr.
113 113 */
114 114 va.va_mask = AT_ALL; /* we want all the attributes */
115 115
116 116 error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 117
118 118 /* check for overflows */
119 119 if (!error) {
120 120 /* Lie about the object type for a referral */
121 121 if (vn_is_nfs_reparse(vp, cr))
122 122 va.va_type = VLNK;
123 123
124 124 acl_perm(vp, exi, &va, cr);
125 125 error = vattr_to_nattr(&va, &ns->ns_attr);
126 126 }
127 127
128 128 VN_RELE(vp);
129 129
130 130 ns->ns_status = puterrno(error);
131 131 }
132 132 void *
133 133 rfs_getattr_getfh(fhandle_t *fhp)
134 134 {
135 135 return (fhp);
136 136 }
137 137
/*
 * Set file attributes (NFSv2 SETATTR).
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * args - filehandle plus the sattr the client wants applied
 * ns   - reply: status and, on success, the post-op attributes
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;		/* flags handed to VOP_SETATTR */
	int in_crit = 0;	/* nonzero while inside nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes before the change */
	struct flock64 bf;	/* region description for VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Tag the operation so delegation monitors recognize this server
	 * and fail with EAGAIN (CC_DONTBLOCK) instead of blocking the
	 * service thread on a delegation recall.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected is [min(old,new) size,
			 * |old - new|): shrinking frees the tail,
			 * growing extends past the old EOF.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner gets the VOP_SPACE shortcut; anyone
		 * else falls through to VOP_SETATTR below with AT_SIZE
		 * still set, so normal access checking applies.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
324 324 void *
325 325 rfs_setattr_getfh(struct nfssaargs *args)
326 326 {
327 327 return (&args->saa_fh);
328 328 }
329 329
330 330 /*
331 331 * Directory lookup.
332 332 * Returns an fhandle and file attributes for file name in a directory.
333 333 */
334 334 /* ARGSUSED */
335 335 void
336 336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 337 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
338 338 {
339 339 int error;
340 340 vnode_t *dvp;
341 341 vnode_t *vp;
342 342 struct vattr va;
343 343 fhandle_t *fhp = da->da_fhandle;
344 344 struct sec_ol sec = {0, 0};
345 345 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 346 char *name;
347 347 struct sockaddr *ca;
348 348
349 349 /*
350 350 * Trusted Extension doesn't support NFSv2. MOUNT
351 351 * will reject v2 clients. Need to prevent v2 client
352 352 * access via WebNFS here.
353 353 */
354 354 if (is_system_labeled() && req->rq_vers == 2) {
355 355 dr->dr_status = NFSERR_ACCES;
356 356 return;
357 357 }
358 358
359 359 /*
360 360 * Disallow NULL paths
361 361 */
362 362 if (da->da_name == NULL || *da->da_name == '\0') {
363 363 dr->dr_status = NFSERR_ACCES;
364 364 return;
365 365 }
366 366
367 367 /*
368 368 * Allow lookups from the root - the default
369 369 * location of the public filehandle.
370 370 */
371 371 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 372 dvp = rootdir;
373 373 VN_HOLD(dvp);
374 374 } else {
375 375 dvp = nfs_fhtovp(fhp, exi);
376 376 if (dvp == NULL) {
377 377 dr->dr_status = NFSERR_STALE;
378 378 return;
379 379 }
380 380 }
381 381
382 382 /*
383 383 * Not allow lookup beyond root.
384 384 * If the filehandle matches a filehandle of the exi,
385 385 * then the ".." refers beyond the root of an exported filesystem.
386 386 */
387 387 if (strcmp(da->da_name, "..") == 0 &&
388 388 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 389 VN_RELE(dvp);
390 390 dr->dr_status = NFSERR_NOENT;
391 391 return;
392 392 }
393 393
394 394 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 395 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 396 MAXPATHLEN);
397 397
398 398 if (name == NULL) {
399 399 dr->dr_status = NFSERR_ACCES;
400 400 return;
401 401 }
402 402
403 403 /*
404 404 * If the public filehandle is used then allow
405 405 * a multi-component lookup, i.e. evaluate
406 406 * a pathname and follow symbolic links if
407 407 * necessary.
408 408 *
409 409 * This may result in a vnode in another filesystem
410 410 * which is OK as long as the filesystem is exported.
411 411 */
412 412 if (PUBLIC_FH2(fhp)) {
413 413 publicfh_flag = TRUE;
414 414 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
415 415 &sec);
416 416 } else {
417 417 /*
418 418 * Do a normal single component lookup.
419 419 */
420 420 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
421 421 NULL, NULL, NULL);
422 422 }
423 423
424 424 if (name != da->da_name)
425 425 kmem_free(name, MAXPATHLEN);
426 426
427 427
428 428 if (!error) {
429 429 va.va_mask = AT_ALL; /* we want everything */
430 430
431 431 error = rfs4_delegated_getattr(vp, &va, 0, cr);
432 432
433 433 /* check for overflows */
434 434 if (!error) {
435 435 acl_perm(vp, exi, &va, cr);
436 436 error = vattr_to_nattr(&va, &dr->dr_attr);
437 437 if (!error) {
438 438 if (sec.sec_flags & SEC_QUERY)
439 439 error = makefh_ol(&dr->dr_fhandle, exi,
440 440 sec.sec_index);
441 441 else {
442 442 error = makefh(&dr->dr_fhandle, vp,
443 443 exi);
444 444 if (!error && publicfh_flag &&
445 445 !chk_clnt_sec(exi, req))
446 446 auth_weak = TRUE;
447 447 }
448 448 }
449 449 }
450 450 VN_RELE(vp);
451 451 }
452 452
453 453 VN_RELE(dvp);
454 454
455 455 /*
456 456 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
457 457 * and have obtained a new exportinfo in exi which needs to be
458 458 * released. Note the the original exportinfo pointed to by exi
459 459 * will be released by the caller, comon_dispatch.
460 460 */
461 461 if (publicfh_flag && exi != NULL)
462 462 exi_rele(exi);
463 463
464 464 /*
465 465 * If it's public fh, no 0x81, and client's flavor is
466 466 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
467 467 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
468 468 */
469 469 if (auth_weak)
470 470 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
471 471 else
472 472 dr->dr_status = puterrno(error);
473 473 }
474 474 void *
475 475 rfs_lookup_getfh(struct nfsdiropargs *da)
476 476 {
477 477 return (da->da_fhandle);
478 478 }
479 479
/*
 * Read symbolic link (NFSv2 READLINK).
 * Returns the string in the symbolic link at the given fhandle.
 *
 * A reparse point (NFS referral) is handled by synthesizing symlink
 * text from the referral data, since GETATTR reported it as VLNK.
 * The link text is converted to the client's character set before
 * being returned; rl->rl_data is freed later by rfs_rlfree().
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* vnode is a reparse point, not a real link */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/*
	 * Refuse objects with mandatory locking; servicing them could
	 * block this thread indefinitely.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the entire buffer
		 * (uio_resid == 0), this stores the NUL one byte past the
		 * NFS_MAXPATHLEN allocation -- confirm VOP_READLINK cannot
		 * return a completely full buffer here.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text to the client's character set, if any. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
610 610 void *
611 611 rfs_readlink_getfh(fhandle_t *fhp)
612 612 {
613 613 return (fhp);
614 614 }
615 615 /*
616 616 * Free data allocated by rfs_readlink
617 617 */
618 618 void
619 619 rfs_rlfree(struct nfsrdlnres *rl)
620 620 {
621 621 if (rl->rl_data != NULL)
622 622 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
623 623 }
624 624
625 625 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
626 626
627 627 /*
628 628 * Read data.
629 629 * Returns some data read from the file at the given fhandle.
630 630 */
631 631 /* ARGSUSED */
632 632 void
633 633 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
634 634 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
635 635 {
636 636 vnode_t *vp;
637 637 int error;
638 638 struct vattr va;
639 639 struct iovec iov;
640 640 struct uio uio;
641 641 mblk_t *mp;
642 642 int alloc_err = 0;
643 643 int in_crit = 0;
644 644 caller_context_t ct;
645 645
646 646 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
647 647 if (vp == NULL) {
648 648 rr->rr_data = NULL;
649 649 rr->rr_status = NFSERR_STALE;
650 650 return;
651 651 }
652 652
653 653 if (vp->v_type != VREG) {
654 654 VN_RELE(vp);
655 655 rr->rr_data = NULL;
656 656 rr->rr_status = NFSERR_ISDIR;
657 657 return;
658 658 }
659 659
660 660 ct.cc_sysid = 0;
661 661 ct.cc_pid = 0;
662 662 ct.cc_caller_id = nfs2_srv_caller_id;
663 663 ct.cc_flags = CC_DONTBLOCK;
664 664
665 665 /*
666 666 * Enter the critical region before calling VOP_RWLOCK
667 667 * to avoid a deadlock with write requests.
668 668 */
669 669 if (nbl_need_check(vp)) {
670 670 nbl_start_crit(vp, RW_READER);
671 671 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
672 672 0, NULL)) {
673 673 nbl_end_crit(vp);
674 674 VN_RELE(vp);
675 675 rr->rr_data = NULL;
676 676 rr->rr_status = NFSERR_ACCES;
677 677 return;
678 678 }
679 679 in_crit = 1;
680 680 }
681 681
682 682 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
683 683
684 684 /* check if a monitor detected a delegation conflict */
685 685 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
686 686 VN_RELE(vp);
687 687 /* mark as wouldblock so response is dropped */
688 688 curthread->t_flag |= T_WOULDBLOCK;
689 689
690 690 rr->rr_data = NULL;
691 691 return;
692 692 }
693 693
694 694 va.va_mask = AT_ALL;
695 695
696 696 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
697 697
698 698 if (error) {
699 699 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
700 700 if (in_crit)
701 701 nbl_end_crit(vp);
702 702
703 703 VN_RELE(vp);
704 704 rr->rr_data = NULL;
705 705 rr->rr_status = puterrno(error);
706 706
707 707 return;
708 708 }
709 709
710 710 /*
711 711 * This is a kludge to allow reading of files created
712 712 * with no read permission. The owner of the file
713 713 * is always allowed to read it.
714 714 */
715 715 if (crgetuid(cr) != va.va_uid) {
716 716 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
717 717
718 718 if (error) {
719 719 /*
720 720 * Exec is the same as read over the net because
721 721 * of demand loading.
722 722 */
723 723 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
724 724 }
725 725 if (error) {
726 726 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
727 727 if (in_crit)
728 728 nbl_end_crit(vp);
729 729 VN_RELE(vp);
730 730 rr->rr_data = NULL;
731 731 rr->rr_status = puterrno(error);
732 732
733 733 return;
734 734 }
735 735 }
736 736
737 737 if (MANDLOCK(vp, va.va_mode)) {
738 738 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
739 739 if (in_crit)
740 740 nbl_end_crit(vp);
741 741
742 742 VN_RELE(vp);
743 743 rr->rr_data = NULL;
744 744 rr->rr_status = NFSERR_ACCES;
745 745
746 746 return;
747 747 }
748 748
749 749 rr->rr_ok.rrok_wlist_len = 0;
750 750 rr->rr_ok.rrok_wlist = NULL;
751 751
752 752 if ((u_offset_t)ra->ra_offset >= va.va_size) {
753 753 rr->rr_count = 0;
754 754 rr->rr_data = NULL;
755 755 /*
756 756 * In this case, status is NFS_OK, but there is no data
757 757 * to encode. So set rr_mp to NULL.
758 758 */
759 759 rr->rr_mp = NULL;
760 760 rr->rr_ok.rrok_wlist = ra->ra_wlist;
761 761 if (rr->rr_ok.rrok_wlist)
762 762 clist_zero_len(rr->rr_ok.rrok_wlist);
763 763 goto done;
764 764 }
765 765
766 766 if (ra->ra_wlist) {
767 767 mp = NULL;
768 768 rr->rr_mp = NULL;
769 769 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
770 770 if (ra->ra_count > iov.iov_len) {
771 771 rr->rr_data = NULL;
772 772 rr->rr_status = NFSERR_INVAL;
773 773 goto done;
774 774 }
775 775 } else {
776 776 /*
777 777 * mp will contain the data to be sent out in the read reply.
778 778 * This will be freed after the reply has been sent out (by the
779 779 * driver).
780 780 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
781 781 * that the call to xdrmblk_putmblk() never fails.
782 782 */
783 783 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
784 784 &alloc_err);
785 785 ASSERT(mp != NULL);
786 786 ASSERT(alloc_err == 0);
787 787
788 788 rr->rr_mp = mp;
789 789
790 790 /*
791 791 * Set up io vector
792 792 */
793 793 iov.iov_base = (caddr_t)mp->b_datap->db_base;
794 794 iov.iov_len = ra->ra_count;
795 795 }
796 796
797 797 uio.uio_iov = &iov;
798 798 uio.uio_iovcnt = 1;
799 799 uio.uio_segflg = UIO_SYSSPACE;
800 800 uio.uio_extflg = UIO_COPY_CACHED;
801 801 uio.uio_loffset = (offset_t)ra->ra_offset;
802 802 uio.uio_resid = ra->ra_count;
803 803
804 804 error = VOP_READ(vp, &uio, 0, cr, &ct);
805 805
806 806 if (error) {
807 807 if (mp)
808 808 freeb(mp);
809 809
810 810 /*
811 811 * check if a monitor detected a delegation conflict and
812 812 * mark as wouldblock so response is dropped
813 813 */
814 814 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
815 815 curthread->t_flag |= T_WOULDBLOCK;
816 816 else
817 817 rr->rr_status = puterrno(error);
818 818
819 819 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
820 820 if (in_crit)
821 821 nbl_end_crit(vp);
822 822
823 823 VN_RELE(vp);
824 824 rr->rr_data = NULL;
825 825
826 826 return;
827 827 }
828 828
829 829 /*
830 830 * Get attributes again so we can send the latest access
831 831 * time to the client side for his cache.
832 832 */
833 833 va.va_mask = AT_ALL;
834 834
835 835 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
836 836
837 837 if (error) {
838 838 if (mp)
839 839 freeb(mp);
840 840
841 841 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
842 842 if (in_crit)
843 843 nbl_end_crit(vp);
844 844
845 845 VN_RELE(vp);
846 846 rr->rr_data = NULL;
847 847 rr->rr_status = puterrno(error);
848 848
849 849 return;
850 850 }
851 851
852 852 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
853 853
854 854 if (mp) {
855 855 rr->rr_data = (char *)mp->b_datap->db_base;
856 856 } else {
857 857 if (ra->ra_wlist) {
858 858 rr->rr_data = (caddr_t)iov.iov_base;
859 859 if (!rdma_setup_read_data2(ra, rr)) {
860 860 rr->rr_data = NULL;
861 861 rr->rr_status = puterrno(NFSERR_INVAL);
862 862 }
863 863 }
864 864 }
865 865 done:
866 866 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
867 867 if (in_crit)
868 868 nbl_end_crit(vp);
869 869
870 870 acl_perm(vp, exi, &va, cr);
871 871
872 872 /* check for overflows */
873 873 error = vattr_to_nattr(&va, &rr->rr_attr);
874 874
875 875 VN_RELE(vp);
876 876
877 877 rr->rr_status = puterrno(error);
878 878 }
879 879
880 880 /*
881 881 * Free data allocated by rfs_read
882 882 */
883 883 void
884 884 rfs_rdfree(struct nfsrdresult *rr)
885 885 {
886 886 mblk_t *mp;
887 887
888 888 if (rr->rr_status == NFS_OK) {
889 889 mp = rr->rr_mp;
890 890 if (mp != NULL)
891 891 freeb(mp);
892 892 }
893 893 }
894 894
895 895 void *
896 896 rfs_read_getfh(struct nfsreadargs *ra)
897 897 {
898 898 return (&ra->ra_fhandle);
899 899 }
900 900
/* Size of the on-stack iovec array rfs_write_sync builds from an mblk chain. */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Writes whose mblk chain fit the stack iovec array vs. needed kmem_alloc. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
907 907
908 908 /*
909 909 * Write data to file.
910 910 * Returns attributes of a file after writing some data to it.
911 911 *
912 912 * Any changes made here, especially in error handling might have
913 913 * to also be done in rfs_write (which clusters write requests).
914 914 */
915 915 void
916 916 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
917 917 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
918 918 {
919 919 int error;
920 920 vnode_t *vp;
921 921 rlim64_t rlimit;
922 922 struct vattr va;
923 923 struct uio uio;
924 924 struct iovec iov[MAX_IOVECS];
925 925 mblk_t *m;
926 926 struct iovec *iovp;
927 927 int iovcnt;
928 928 cred_t *savecred;
929 929 int in_crit = 0;
930 930 caller_context_t ct;
931 931
932 932 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
933 933 if (vp == NULL) {
934 934 ns->ns_status = NFSERR_STALE;
935 935 return;
936 936 }
937 937
938 938 if (rdonly(exi, vp, req)) {
939 939 VN_RELE(vp);
940 940 ns->ns_status = NFSERR_ROFS;
941 941 return;
942 942 }
943 943
944 944 if (vp->v_type != VREG) {
945 945 VN_RELE(vp);
946 946 ns->ns_status = NFSERR_ISDIR;
947 947 return;
948 948 }
949 949
950 950 ct.cc_sysid = 0;
951 951 ct.cc_pid = 0;
952 952 ct.cc_caller_id = nfs2_srv_caller_id;
953 953 ct.cc_flags = CC_DONTBLOCK;
954 954
955 955 va.va_mask = AT_UID|AT_MODE;
956 956
957 957 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
958 958
959 959 if (error) {
960 960 VN_RELE(vp);
961 961 ns->ns_status = puterrno(error);
962 962
963 963 return;
964 964 }
965 965
966 966 if (crgetuid(cr) != va.va_uid) {
967 967 /*
968 968 * This is a kludge to allow writes of files created
969 969 * with read only permission. The owner of the file
970 970 * is always allowed to write it.
971 971 */
972 972 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
973 973
974 974 if (error) {
975 975 VN_RELE(vp);
976 976 ns->ns_status = puterrno(error);
977 977 return;
978 978 }
979 979 }
980 980
981 981 /*
982 982 * Can't access a mandatory lock file. This might cause
983 983 * the NFS service thread to block forever waiting for a
984 984 * lock to be released that will never be released.
985 985 */
986 986 if (MANDLOCK(vp, va.va_mode)) {
987 987 VN_RELE(vp);
988 988 ns->ns_status = NFSERR_ACCES;
989 989 return;
990 990 }
991 991
992 992 /*
993 993 * We have to enter the critical region before calling VOP_RWLOCK
994 994 * to avoid a deadlock with ufs.
995 995 */
996 996 if (nbl_need_check(vp)) {
997 997 nbl_start_crit(vp, RW_READER);
998 998 in_crit = 1;
999 999 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1000 1000 wa->wa_count, 0, NULL)) {
1001 1001 error = EACCES;
1002 1002 goto out;
1003 1003 }
1004 1004 }
1005 1005
1006 1006 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1007 1007
1008 1008 /* check if a monitor detected a delegation conflict */
1009 1009 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1010 1010 VN_RELE(vp);
1011 1011 /* mark as wouldblock so response is dropped */
1012 1012 curthread->t_flag |= T_WOULDBLOCK;
1013 1013 return;
1014 1014 }
1015 1015
1016 1016 if (wa->wa_data || wa->wa_rlist) {
1017 1017 /* Do the RDMA thing if necessary */
1018 1018 if (wa->wa_rlist) {
1019 1019 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1020 1020 iov[0].iov_len = wa->wa_count;
1021 1021 } else {
1022 1022 iov[0].iov_base = wa->wa_data;
1023 1023 iov[0].iov_len = wa->wa_count;
1024 1024 }
1025 1025 uio.uio_iov = iov;
1026 1026 uio.uio_iovcnt = 1;
1027 1027 uio.uio_segflg = UIO_SYSSPACE;
1028 1028 uio.uio_extflg = UIO_COPY_DEFAULT;
1029 1029 uio.uio_loffset = (offset_t)wa->wa_offset;
1030 1030 uio.uio_resid = wa->wa_count;
1031 1031 /*
1032 1032 * The limit is checked on the client. We
1033 1033 * should allow any size writes here.
1034 1034 */
1035 1035 uio.uio_llimit = curproc->p_fsz_ctl;
1036 1036 rlimit = uio.uio_llimit - wa->wa_offset;
1037 1037 if (rlimit < (rlim64_t)uio.uio_resid)
1038 1038 uio.uio_resid = (uint_t)rlimit;
1039 1039
1040 1040 /*
1041 1041 * for now we assume no append mode
1042 1042 */
1043 1043 /*
1044 1044 * We're changing creds because VM may fault and we need
1045 1045 * the cred of the current thread to be used if quota
1046 1046 * checking is enabled.
1047 1047 */
1048 1048 savecred = curthread->t_cred;
1049 1049 curthread->t_cred = cr;
1050 1050 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1051 1051 curthread->t_cred = savecred;
1052 1052 } else {
1053 1053 iovcnt = 0;
1054 1054 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1055 1055 iovcnt++;
1056 1056 if (iovcnt <= MAX_IOVECS) {
1057 1057 #ifdef DEBUG
1058 1058 rfs_write_sync_hits++;
1059 1059 #endif
1060 1060 iovp = iov;
1061 1061 } else {
1062 1062 #ifdef DEBUG
1063 1063 rfs_write_sync_misses++;
1064 1064 #endif
1065 1065 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1066 1066 }
1067 1067 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1068 1068 uio.uio_iov = iovp;
1069 1069 uio.uio_iovcnt = iovcnt;
1070 1070 uio.uio_segflg = UIO_SYSSPACE;
1071 1071 uio.uio_extflg = UIO_COPY_DEFAULT;
1072 1072 uio.uio_loffset = (offset_t)wa->wa_offset;
1073 1073 uio.uio_resid = wa->wa_count;
1074 1074 /*
1075 1075 * The limit is checked on the client. We
1076 1076 * should allow any size writes here.
1077 1077 */
1078 1078 uio.uio_llimit = curproc->p_fsz_ctl;
1079 1079 rlimit = uio.uio_llimit - wa->wa_offset;
1080 1080 if (rlimit < (rlim64_t)uio.uio_resid)
1081 1081 uio.uio_resid = (uint_t)rlimit;
1082 1082
1083 1083 /*
1084 1084 * For now we assume no append mode.
1085 1085 */
1086 1086 /*
1087 1087 * We're changing creds because VM may fault and we need
1088 1088 * the cred of the current thread to be used if quota
1089 1089 * checking is enabled.
1090 1090 */
1091 1091 savecred = curthread->t_cred;
1092 1092 curthread->t_cred = cr;
1093 1093 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1094 1094 curthread->t_cred = savecred;
1095 1095
1096 1096 if (iovp != iov)
1097 1097 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1098 1098 }
1099 1099
1100 1100 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1101 1101
1102 1102 if (!error) {
1103 1103 /*
1104 1104 * Get attributes again so we send the latest mod
1105 1105 * time to the client side for his cache.
1106 1106 */
1107 1107 va.va_mask = AT_ALL; /* now we want everything */
1108 1108
1109 1109 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1110 1110
1111 1111 /* check for overflows */
1112 1112 if (!error) {
1113 1113 acl_perm(vp, exi, &va, cr);
1114 1114 error = vattr_to_nattr(&va, &ns->ns_attr);
1115 1115 }
1116 1116 }
1117 1117
1118 1118 out:
1119 1119 if (in_crit)
1120 1120 nbl_end_crit(vp);
1121 1121 VN_RELE(vp);
1122 1122
1123 1123 /* check if a monitor detected a delegation conflict */
1124 1124 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1125 1125 /* mark as wouldblock so response is dropped */
1126 1126 curthread->t_flag |= T_WOULDBLOCK;
1127 1127 else
1128 1128 ns->ns_status = puterrno(error);
1129 1129
1130 1130 }
1131 1131
/*
 * One client WRITE request participating in a write cluster.  Each NFS
 * server thread handling a WRITE builds one of these on its own stack
 * (see nrpsp in rfs_write) and either services the whole cluster or
 * sleeps until the servicing thread fills in its status.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's WRITE arguments */
	struct nfsattrstat *ns;		/* reply; ns_status doubles as "done" flag */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of this requester */
	kthread_t *thread;		/* thread owning this request */
	struct rfs_async_write *list;	/* next request, sorted by wa_offset */
};
1140 1140
/*
 * A cluster of WRITE requests against a single file, identified by fhp.
 * Clusters are chained off rfs_async_write_head; the chain and every
 * cluster's request list are protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by all requests */
	kcondvar_t cv;			/* waiters sleep here until serviced */
	struct rfs_async_write *list;	/* requests, kept sorted by offset */
	struct rfs_async_write_list *next;	/* next cluster in global list */
};
1147 1147
/* Head of the global cluster list; protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Largest iovec count served from the on-stack array in rfs_write(). */
#define	MAXCLIOVECS	42
/* Sentinel for "reply status not yet filled in" (0 would mean NFS_OK). */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* Gathered writes served by the stack iovec array vs. kmem_alloc fallback. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1159 1159
1160 1160 /*
1161 1161 * Write data to file.
1162 1162 * Returns attributes of a file after writing some data to it.
1163 1163 */
1164 1164 void
1165 1165 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1166 1166 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1167 1167 {
1168 1168 int error;
1169 1169 vnode_t *vp;
1170 1170 rlim64_t rlimit;
1171 1171 struct vattr va;
1172 1172 struct uio uio;
1173 1173 struct rfs_async_write_list *lp;
1174 1174 struct rfs_async_write_list *nlp;
1175 1175 struct rfs_async_write *rp;
1176 1176 struct rfs_async_write *nrp;
1177 1177 struct rfs_async_write *trp;
1178 1178 struct rfs_async_write *lrp;
1179 1179 int data_written;
1180 1180 int iovcnt;
1181 1181 mblk_t *m;
1182 1182 struct iovec *iovp;
1183 1183 struct iovec *niovp;
1184 1184 struct iovec iov[MAXCLIOVECS];
1185 1185 int count;
1186 1186 int rcount;
1187 1187 uint_t off;
1188 1188 uint_t len;
1189 1189 struct rfs_async_write nrpsp;
1190 1190 struct rfs_async_write_list nlpsp;
1191 1191 ushort_t t_flag;
1192 1192 cred_t *savecred;
1193 1193 int in_crit = 0;
1194 1194 caller_context_t ct;
1195 1195
1196 1196 if (!rfs_write_async) {
1197 1197 rfs_write_sync(wa, ns, exi, req, cr);
1198 1198 return;
1199 1199 }
1200 1200
1201 1201 /*
1202 1202 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1203 1203 * is considered an OK.
↓ open down ↓ |
1203 lines elided |
↑ open up ↑ |
1204 1204 */
1205 1205 ns->ns_status = RFSWRITE_INITVAL;
1206 1206
1207 1207 nrp = &nrpsp;
1208 1208 nrp->wa = wa;
1209 1209 nrp->ns = ns;
1210 1210 nrp->req = req;
1211 1211 nrp->cr = cr;
1212 1212 nrp->thread = curthread;
1213 1213
1214 - ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1215 -
1216 1214 /*
1217 1215 * Look to see if there is already a cluster started
1218 1216 * for this file.
1219 1217 */
1220 1218 mutex_enter(&rfs_async_write_lock);
1221 1219 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1222 1220 if (bcmp(&wa->wa_fhandle, lp->fhp,
1223 1221 sizeof (fhandle_t)) == 0)
1224 1222 break;
1225 1223 }
1226 1224
1227 1225 /*
1228 1226 * If lp is non-NULL, then there is already a cluster
1229 1227 * started. We need to place ourselves in the cluster
1230 1228 * list in the right place as determined by starting
1231 1229 * offset. Conflicts with non-blocking mandatory locked
1232 1230 * regions will be checked when the cluster is processed.
1233 1231 */
1234 1232 if (lp != NULL) {
1235 1233 rp = lp->list;
1236 1234 trp = NULL;
1237 1235 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1238 1236 trp = rp;
1239 1237 rp = rp->list;
1240 1238 }
1241 1239 nrp->list = rp;
1242 1240 if (trp == NULL)
1243 1241 lp->list = nrp;
1244 1242 else
1245 1243 trp->list = nrp;
1246 1244 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1247 1245 cv_wait(&lp->cv, &rfs_async_write_lock);
1248 1246 mutex_exit(&rfs_async_write_lock);
1249 1247
1250 1248 return;
1251 1249 }
1252 1250
1253 1251 /*
1254 1252 * No cluster started yet, start one and add ourselves
1255 1253 * to the list of clusters.
1256 1254 */
1257 1255 nrp->list = NULL;
1258 1256
1259 1257 nlp = &nlpsp;
1260 1258 nlp->fhp = &wa->wa_fhandle;
1261 1259 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1262 1260 nlp->list = nrp;
1263 1261 nlp->next = NULL;
1264 1262
1265 1263 if (rfs_async_write_head == NULL) {
1266 1264 rfs_async_write_head = nlp;
1267 1265 } else {
1268 1266 lp = rfs_async_write_head;
1269 1267 while (lp->next != NULL)
1270 1268 lp = lp->next;
1271 1269 lp->next = nlp;
1272 1270 }
1273 1271 mutex_exit(&rfs_async_write_lock);
1274 1272
1275 1273 /*
1276 1274 * Convert the file handle common to all of the requests
1277 1275 * in this cluster to a vnode.
1278 1276 */
1279 1277 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1280 1278 if (vp == NULL) {
1281 1279 mutex_enter(&rfs_async_write_lock);
1282 1280 if (rfs_async_write_head == nlp)
1283 1281 rfs_async_write_head = nlp->next;
1284 1282 else {
1285 1283 lp = rfs_async_write_head;
1286 1284 while (lp->next != nlp)
1287 1285 lp = lp->next;
1288 1286 lp->next = nlp->next;
1289 1287 }
1290 1288 t_flag = curthread->t_flag & T_WOULDBLOCK;
1291 1289 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1292 1290 rp->ns->ns_status = NFSERR_STALE;
1293 1291 rp->thread->t_flag |= t_flag;
1294 1292 }
1295 1293 cv_broadcast(&nlp->cv);
1296 1294 mutex_exit(&rfs_async_write_lock);
1297 1295
1298 1296 return;
1299 1297 }
1300 1298
1301 1299 /*
1302 1300 * Can only write regular files. Attempts to write any
1303 1301 * other file types fail with EISDIR.
1304 1302 */
1305 1303 if (vp->v_type != VREG) {
1306 1304 VN_RELE(vp);
1307 1305 mutex_enter(&rfs_async_write_lock);
1308 1306 if (rfs_async_write_head == nlp)
1309 1307 rfs_async_write_head = nlp->next;
1310 1308 else {
1311 1309 lp = rfs_async_write_head;
1312 1310 while (lp->next != nlp)
1313 1311 lp = lp->next;
1314 1312 lp->next = nlp->next;
1315 1313 }
1316 1314 t_flag = curthread->t_flag & T_WOULDBLOCK;
1317 1315 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1318 1316 rp->ns->ns_status = NFSERR_ISDIR;
1319 1317 rp->thread->t_flag |= t_flag;
1320 1318 }
1321 1319 cv_broadcast(&nlp->cv);
1322 1320 mutex_exit(&rfs_async_write_lock);
1323 1321
1324 1322 return;
1325 1323 }
1326 1324
1327 1325 /*
1328 1326 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1329 1327 * deadlock with ufs.
1330 1328 */
1331 1329 if (nbl_need_check(vp)) {
1332 1330 nbl_start_crit(vp, RW_READER);
1333 1331 in_crit = 1;
1334 1332 }
1335 1333
1336 1334 ct.cc_sysid = 0;
1337 1335 ct.cc_pid = 0;
1338 1336 ct.cc_caller_id = nfs2_srv_caller_id;
1339 1337 ct.cc_flags = CC_DONTBLOCK;
1340 1338
1341 1339 /*
1342 1340 * Lock the file for writing. This operation provides
1343 1341 * the delay which allows clusters to grow.
1344 1342 */
1345 1343 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1346 1344
1347 1345 /* check if a monitor detected a delegation conflict */
1348 1346 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1349 1347 if (in_crit)
1350 1348 nbl_end_crit(vp);
1351 1349 VN_RELE(vp);
1352 1350 /* mark as wouldblock so response is dropped */
1353 1351 curthread->t_flag |= T_WOULDBLOCK;
1354 1352 mutex_enter(&rfs_async_write_lock);
1355 1353 if (rfs_async_write_head == nlp)
1356 1354 rfs_async_write_head = nlp->next;
1357 1355 else {
1358 1356 lp = rfs_async_write_head;
1359 1357 while (lp->next != nlp)
1360 1358 lp = lp->next;
1361 1359 lp->next = nlp->next;
1362 1360 }
1363 1361 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1364 1362 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1365 1363 rp->ns->ns_status = puterrno(error);
1366 1364 rp->thread->t_flag |= T_WOULDBLOCK;
1367 1365 }
1368 1366 }
1369 1367 cv_broadcast(&nlp->cv);
1370 1368 mutex_exit(&rfs_async_write_lock);
1371 1369
1372 1370 return;
1373 1371 }
1374 1372
1375 1373 /*
1376 1374 * Disconnect this cluster from the list of clusters.
1377 1375 * The cluster that is being dealt with must be fixed
1378 1376 * in size after this point, so there is no reason
1379 1377 * to leave it on the list so that new requests can
1380 1378 * find it.
1381 1379 *
1382 1380 * The algorithm is that the first write request will
1383 1381 * create a cluster, convert the file handle to a
1384 1382 * vnode pointer, and then lock the file for writing.
1385 1383 * This request is not likely to be clustered with
1386 1384 * any others. However, the next request will create
1387 1385 * a new cluster and be blocked in VOP_RWLOCK while
1388 1386 * the first request is being processed. This delay
1389 1387 * will allow more requests to be clustered in this
1390 1388 * second cluster.
1391 1389 */
1392 1390 mutex_enter(&rfs_async_write_lock);
1393 1391 if (rfs_async_write_head == nlp)
1394 1392 rfs_async_write_head = nlp->next;
1395 1393 else {
1396 1394 lp = rfs_async_write_head;
1397 1395 while (lp->next != nlp)
1398 1396 lp = lp->next;
1399 1397 lp->next = nlp->next;
1400 1398 }
1401 1399 mutex_exit(&rfs_async_write_lock);
1402 1400
1403 1401 /*
1404 1402 * Step through the list of requests in this cluster.
1405 1403 * We need to check permissions to make sure that all
1406 1404 * of the requests have sufficient permission to write
1407 1405 * the file. A cluster can be composed of requests
1408 1406 * from different clients and different users on each
1409 1407 * client.
1410 1408 *
1411 1409 * As a side effect, we also calculate the size of the
1412 1410 * byte range that this cluster encompasses.
1413 1411 */
1414 1412 rp = nlp->list;
1415 1413 off = rp->wa->wa_offset;
1416 1414 len = (uint_t)0;
1417 1415 do {
1418 1416 if (rdonly(exi, vp, rp->req)) {
1419 1417 rp->ns->ns_status = NFSERR_ROFS;
1420 1418 t_flag = curthread->t_flag & T_WOULDBLOCK;
1421 1419 rp->thread->t_flag |= t_flag;
1422 1420 continue;
1423 1421 }
1424 1422
1425 1423 va.va_mask = AT_UID|AT_MODE;
1426 1424
1427 1425 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1428 1426
1429 1427 if (!error) {
1430 1428 if (crgetuid(rp->cr) != va.va_uid) {
1431 1429 /*
1432 1430 * This is a kludge to allow writes of files
1433 1431 * created with read only permission. The
1434 1432 * owner of the file is always allowed to
1435 1433 * write it.
1436 1434 */
1437 1435 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1438 1436 }
1439 1437 if (!error && MANDLOCK(vp, va.va_mode))
1440 1438 error = EACCES;
1441 1439 }
1442 1440
1443 1441 /*
1444 1442 * Check for a conflict with a nbmand-locked region.
1445 1443 */
1446 1444 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1447 1445 rp->wa->wa_count, 0, NULL)) {
1448 1446 error = EACCES;
1449 1447 }
1450 1448
1451 1449 if (error) {
1452 1450 rp->ns->ns_status = puterrno(error);
1453 1451 t_flag = curthread->t_flag & T_WOULDBLOCK;
1454 1452 rp->thread->t_flag |= t_flag;
1455 1453 continue;
1456 1454 }
1457 1455 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1458 1456 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1459 1457 } while ((rp = rp->list) != NULL);
1460 1458
1461 1459 /*
1462 1460 * Step through the cluster attempting to gather as many
1463 1461 * requests which are contiguous as possible. These
1464 1462 * contiguous requests are handled via one call to VOP_WRITE
1465 1463 * instead of different calls to VOP_WRITE. We also keep
1466 1464 * track of the fact that any data was written.
1467 1465 */
1468 1466 rp = nlp->list;
1469 1467 data_written = 0;
1470 1468 do {
1471 1469 /*
1472 1470 * Skip any requests which are already marked as having an
1473 1471 * error.
1474 1472 */
1475 1473 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1476 1474 rp = rp->list;
1477 1475 continue;
1478 1476 }
1479 1477
1480 1478 /*
1481 1479 * Count the number of iovec's which are required
1482 1480 * to handle this set of requests. One iovec is
1483 1481 * needed for each data buffer, whether addressed
1484 1482 * by wa_data or by the b_rptr pointers in the
1485 1483 * mblk chains.
1486 1484 */
1487 1485 iovcnt = 0;
1488 1486 lrp = rp;
1489 1487 for (;;) {
1490 1488 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1491 1489 iovcnt++;
1492 1490 else {
1493 1491 m = lrp->wa->wa_mblk;
1494 1492 while (m != NULL) {
1495 1493 iovcnt++;
1496 1494 m = m->b_cont;
1497 1495 }
1498 1496 }
1499 1497 if (lrp->list == NULL ||
1500 1498 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1501 1499 lrp->wa->wa_offset + lrp->wa->wa_count !=
1502 1500 lrp->list->wa->wa_offset) {
1503 1501 lrp = lrp->list;
1504 1502 break;
1505 1503 }
1506 1504 lrp = lrp->list;
1507 1505 }
1508 1506
1509 1507 if (iovcnt <= MAXCLIOVECS) {
1510 1508 #ifdef DEBUG
1511 1509 rfs_write_hits++;
1512 1510 #endif
1513 1511 niovp = iov;
1514 1512 } else {
1515 1513 #ifdef DEBUG
1516 1514 rfs_write_misses++;
1517 1515 #endif
1518 1516 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1519 1517 }
1520 1518 /*
1521 1519 * Put together the scatter/gather iovecs.
1522 1520 */
1523 1521 iovp = niovp;
1524 1522 trp = rp;
1525 1523 count = 0;
1526 1524 do {
1527 1525 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1528 1526 if (trp->wa->wa_rlist) {
1529 1527 iovp->iov_base =
1530 1528 (char *)((trp->wa->wa_rlist)->
1531 1529 u.c_daddr3);
1532 1530 iovp->iov_len = trp->wa->wa_count;
1533 1531 } else {
1534 1532 iovp->iov_base = trp->wa->wa_data;
1535 1533 iovp->iov_len = trp->wa->wa_count;
1536 1534 }
1537 1535 iovp++;
1538 1536 } else {
1539 1537 m = trp->wa->wa_mblk;
1540 1538 rcount = trp->wa->wa_count;
1541 1539 while (m != NULL) {
1542 1540 iovp->iov_base = (caddr_t)m->b_rptr;
1543 1541 iovp->iov_len = (m->b_wptr - m->b_rptr);
1544 1542 rcount -= iovp->iov_len;
1545 1543 if (rcount < 0)
1546 1544 iovp->iov_len += rcount;
1547 1545 iovp++;
1548 1546 if (rcount <= 0)
1549 1547 break;
1550 1548 m = m->b_cont;
1551 1549 }
1552 1550 }
1553 1551 count += trp->wa->wa_count;
1554 1552 trp = trp->list;
1555 1553 } while (trp != lrp);
1556 1554
1557 1555 uio.uio_iov = niovp;
1558 1556 uio.uio_iovcnt = iovcnt;
1559 1557 uio.uio_segflg = UIO_SYSSPACE;
1560 1558 uio.uio_extflg = UIO_COPY_DEFAULT;
1561 1559 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1562 1560 uio.uio_resid = count;
1563 1561 /*
1564 1562 * The limit is checked on the client. We
1565 1563 * should allow any size writes here.
1566 1564 */
1567 1565 uio.uio_llimit = curproc->p_fsz_ctl;
1568 1566 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1569 1567 if (rlimit < (rlim64_t)uio.uio_resid)
1570 1568 uio.uio_resid = (uint_t)rlimit;
1571 1569
1572 1570 /*
1573 1571 * For now we assume no append mode.
1574 1572 */
1575 1573
1576 1574 /*
1577 1575 * We're changing creds because VM may fault
1578 1576 * and we need the cred of the current
1579 1577 * thread to be used if quota * checking is
1580 1578 * enabled.
1581 1579 */
1582 1580 savecred = curthread->t_cred;
1583 1581 curthread->t_cred = cr;
1584 1582 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1585 1583 curthread->t_cred = savecred;
1586 1584
1587 1585 /* check if a monitor detected a delegation conflict */
1588 1586 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1589 1587 /* mark as wouldblock so response is dropped */
1590 1588 curthread->t_flag |= T_WOULDBLOCK;
1591 1589
1592 1590 if (niovp != iov)
1593 1591 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1594 1592
1595 1593 if (!error) {
1596 1594 data_written = 1;
1597 1595 /*
1598 1596 * Get attributes again so we send the latest mod
1599 1597 * time to the client side for his cache.
1600 1598 */
1601 1599 va.va_mask = AT_ALL; /* now we want everything */
1602 1600
1603 1601 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1604 1602
1605 1603 if (!error)
1606 1604 acl_perm(vp, exi, &va, rp->cr);
1607 1605 }
1608 1606
1609 1607 /*
1610 1608 * Fill in the status responses for each request
1611 1609 * which was just handled. Also, copy the latest
1612 1610 * attributes in to the attribute responses if
1613 1611 * appropriate.
1614 1612 */
1615 1613 t_flag = curthread->t_flag & T_WOULDBLOCK;
1616 1614 do {
1617 1615 rp->thread->t_flag |= t_flag;
1618 1616 /* check for overflows */
1619 1617 if (!error) {
1620 1618 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1621 1619 }
1622 1620 rp->ns->ns_status = puterrno(error);
1623 1621 rp = rp->list;
1624 1622 } while (rp != lrp);
1625 1623 } while (rp != NULL);
1626 1624
1627 1625 /*
1628 1626 * If any data was written at all, then we need to flush
1629 1627 * the data and metadata to stable storage.
1630 1628 */
1631 1629 if (data_written) {
1632 1630 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1633 1631
1634 1632 if (!error) {
1635 1633 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1636 1634 }
1637 1635 }
1638 1636
1639 1637 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1640 1638
1641 1639 if (in_crit)
1642 1640 nbl_end_crit(vp);
1643 1641 VN_RELE(vp);
1644 1642
1645 1643 t_flag = curthread->t_flag & T_WOULDBLOCK;
1646 1644 mutex_enter(&rfs_async_write_lock);
1647 1645 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1648 1646 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1649 1647 rp->ns->ns_status = puterrno(error);
1650 1648 rp->thread->t_flag |= t_flag;
1651 1649 }
1652 1650 }
1653 1651 cv_broadcast(&nlp->cv);
1654 1652 mutex_exit(&rfs_async_write_lock);
1655 1653
1656 1654 }
1657 1655
1658 1656 void *
1659 1657 rfs_write_getfh(struct nfswriteargs *wa)
1660 1658 {
1661 1659 return (&wa->wa_fhandle);
1662 1660 }
1663 1661
1664 1662 /*
1665 1663 * Create a file.
1666 1664 * Creates a file with given attributes and returns those attributes
1667 1665 * and an fhandle for the new file.
1668 1666 */
1669 1667 void
1670 1668 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1671 1669 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1672 1670 {
1673 1671 int error;
1674 1672 int lookuperr;
1675 1673 int in_crit = 0;
1676 1674 struct vattr va;
1677 1675 vnode_t *vp;
1678 1676 vnode_t *realvp;
1679 1677 vnode_t *dvp;
1680 1678 char *name = args->ca_da.da_name;
1681 1679 vnode_t *tvp = NULL;
1682 1680 int mode;
1683 1681 int lookup_ok;
1684 1682 bool_t trunc;
1685 1683 struct sockaddr *ca;
1686 1684
1687 1685 /*
1688 1686 * Disallow NULL paths
1689 1687 */
1690 1688 if (name == NULL || *name == '\0') {
1691 1689 dr->dr_status = NFSERR_ACCES;
1692 1690 return;
1693 1691 }
1694 1692
1695 1693 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1696 1694 if (dvp == NULL) {
1697 1695 dr->dr_status = NFSERR_STALE;
1698 1696 return;
1699 1697 }
1700 1698
1701 1699 error = sattr_to_vattr(args->ca_sa, &va);
1702 1700 if (error) {
1703 1701 dr->dr_status = puterrno(error);
1704 1702 return;
1705 1703 }
1706 1704
1707 1705 /*
1708 1706 * Must specify the mode.
1709 1707 */
1710 1708 if (!(va.va_mask & AT_MODE)) {
1711 1709 VN_RELE(dvp);
1712 1710 dr->dr_status = NFSERR_INVAL;
1713 1711 return;
1714 1712 }
1715 1713
1716 1714 /*
1717 1715 * This is a completely gross hack to make mknod
1718 1716 * work over the wire until we can wack the protocol
1719 1717 */
1720 1718 if ((va.va_mode & IFMT) == IFCHR) {
1721 1719 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1722 1720 va.va_type = VFIFO; /* xtra kludge for named pipe */
1723 1721 else {
1724 1722 va.va_type = VCHR;
1725 1723 /*
1726 1724 * uncompress the received dev_t
1727 1725 * if the top half is zero indicating a request
1728 1726 * from an `older style' OS.
1729 1727 */
1730 1728 if ((va.va_size & 0xffff0000) == 0)
1731 1729 va.va_rdev = nfsv2_expdev(va.va_size);
1732 1730 else
1733 1731 va.va_rdev = (dev_t)va.va_size;
1734 1732 }
1735 1733 va.va_mask &= ~AT_SIZE;
1736 1734 } else if ((va.va_mode & IFMT) == IFBLK) {
1737 1735 va.va_type = VBLK;
1738 1736 /*
1739 1737 * uncompress the received dev_t
1740 1738 * if the top half is zero indicating a request
1741 1739 * from an `older style' OS.
1742 1740 */
1743 1741 if ((va.va_size & 0xffff0000) == 0)
1744 1742 va.va_rdev = nfsv2_expdev(va.va_size);
1745 1743 else
1746 1744 va.va_rdev = (dev_t)va.va_size;
1747 1745 va.va_mask &= ~AT_SIZE;
1748 1746 } else if ((va.va_mode & IFMT) == IFSOCK) {
1749 1747 va.va_type = VSOCK;
1750 1748 } else {
1751 1749 va.va_type = VREG;
1752 1750 }
1753 1751 va.va_mode &= ~IFMT;
1754 1752 va.va_mask |= AT_TYPE;
1755 1753
1756 1754 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1757 1755 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1758 1756 MAXPATHLEN);
1759 1757 if (name == NULL) {
1760 1758 dr->dr_status = puterrno(EINVAL);
1761 1759 return;
1762 1760 }
1763 1761
1764 1762 /*
1765 1763 * Why was the choice made to use VWRITE as the mode to the
1766 1764 * call to VOP_CREATE ? This results in a bug. When a client
1767 1765 * opens a file that already exists and is RDONLY, the second
1768 1766 * open fails with an EACESS because of the mode.
1769 1767 * bug ID 1054648.
1770 1768 */
1771 1769 lookup_ok = 0;
1772 1770 mode = VWRITE;
1773 1771 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1774 1772 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1775 1773 NULL, NULL, NULL);
1776 1774 if (!error) {
1777 1775 struct vattr at;
1778 1776
1779 1777 lookup_ok = 1;
1780 1778 at.va_mask = AT_MODE;
1781 1779 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1782 1780 if (!error)
1783 1781 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1784 1782 VN_RELE(tvp);
1785 1783 tvp = NULL;
1786 1784 }
1787 1785 }
1788 1786
1789 1787 if (!lookup_ok) {
1790 1788 if (rdonly(exi, dvp, req)) {
1791 1789 error = EROFS;
1792 1790 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1793 1791 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1794 1792 error = EPERM;
1795 1793 } else {
1796 1794 error = 0;
1797 1795 }
1798 1796 }
1799 1797
1800 1798 /*
1801 1799 * If file size is being modified on an already existing file
1802 1800 * make sure that there are no conflicting non-blocking mandatory
1803 1801 * locks in the region being manipulated. Return EACCES if there
1804 1802 * are conflicting locks.
1805 1803 */
1806 1804 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1807 1805 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1808 1806 NULL, NULL, NULL);
1809 1807
1810 1808 if (!lookuperr &&
1811 1809 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1812 1810 VN_RELE(tvp);
1813 1811 curthread->t_flag |= T_WOULDBLOCK;
1814 1812 goto out;
1815 1813 }
1816 1814
1817 1815 if (!lookuperr && nbl_need_check(tvp)) {
1818 1816 /*
1819 1817 * The file exists. Now check if it has any
1820 1818 * conflicting non-blocking mandatory locks
1821 1819 * in the region being changed.
1822 1820 */
1823 1821 struct vattr bva;
1824 1822 u_offset_t offset;
1825 1823 ssize_t length;
1826 1824
1827 1825 nbl_start_crit(tvp, RW_READER);
1828 1826 in_crit = 1;
1829 1827
1830 1828 bva.va_mask = AT_SIZE;
1831 1829 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1832 1830 if (!error) {
1833 1831 if (va.va_size < bva.va_size) {
1834 1832 offset = va.va_size;
1835 1833 length = bva.va_size - va.va_size;
1836 1834 } else {
1837 1835 offset = bva.va_size;
1838 1836 length = va.va_size - bva.va_size;
1839 1837 }
1840 1838 if (length) {
1841 1839 if (nbl_conflict(tvp, NBL_WRITE,
1842 1840 offset, length, 0, NULL)) {
1843 1841 error = EACCES;
1844 1842 }
1845 1843 }
1846 1844 }
1847 1845 if (error) {
1848 1846 nbl_end_crit(tvp);
1849 1847 VN_RELE(tvp);
1850 1848 in_crit = 0;
1851 1849 }
1852 1850 } else if (tvp != NULL) {
1853 1851 VN_RELE(tvp);
1854 1852 }
1855 1853 }
1856 1854
1857 1855 if (!error) {
1858 1856 /*
1859 1857 * If filesystem is shared with nosuid the remove any
1860 1858 * setuid/setgid bits on create.
1861 1859 */
1862 1860 if (va.va_type == VREG &&
1863 1861 exi->exi_export.ex_flags & EX_NOSUID)
1864 1862 va.va_mode &= ~(VSUID | VSGID);
1865 1863
1866 1864 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1867 1865 NULL, NULL);
1868 1866
1869 1867 if (!error) {
1870 1868
1871 1869 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1872 1870 trunc = TRUE;
1873 1871 else
1874 1872 trunc = FALSE;
1875 1873
1876 1874 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1877 1875 VN_RELE(vp);
1878 1876 curthread->t_flag |= T_WOULDBLOCK;
1879 1877 goto out;
1880 1878 }
1881 1879 va.va_mask = AT_ALL;
1882 1880
1883 1881 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1884 1882
1885 1883 /* check for overflows */
1886 1884 if (!error) {
1887 1885 acl_perm(vp, exi, &va, cr);
1888 1886 error = vattr_to_nattr(&va, &dr->dr_attr);
1889 1887 if (!error) {
1890 1888 error = makefh(&dr->dr_fhandle, vp,
1891 1889 exi);
1892 1890 }
1893 1891 }
1894 1892 /*
1895 1893 * Force modified metadata out to stable storage.
1896 1894 *
1897 1895 * if a underlying vp exists, pass it to VOP_FSYNC
1898 1896 */
1899 1897 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1900 1898 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1901 1899 else
1902 1900 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1903 1901 VN_RELE(vp);
1904 1902 }
1905 1903
1906 1904 if (in_crit) {
1907 1905 nbl_end_crit(tvp);
1908 1906 VN_RELE(tvp);
1909 1907 }
1910 1908 }
1911 1909
1912 1910 /*
1913 1911 * Force modified data and metadata out to stable storage.
1914 1912 */
1915 1913 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1916 1914
1917 1915 out:
1918 1916
1919 1917 VN_RELE(dvp);
1920 1918
1921 1919 dr->dr_status = puterrno(error);
1922 1920
1923 1921 if (name != args->ca_da.da_name)
1924 1922 kmem_free(name, MAXPATHLEN);
1925 1923 }
1926 1924 void *
1927 1925 rfs_create_getfh(struct nfscreatargs *args)
1928 1926 {
1929 1927 return (args->ca_da.da_fhandle);
1930 1928 }
1931 1929
/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;		/* parent directory, held by nfs_fhtovp() */
	vnode_t *targvp;	/* the file being removed */
	int in_crit = 0;	/* nonzero while in targvp's nbmand crit region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * (The lookup gives us the target vnode to run the checks against.)
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Reject the remove if it conflicts with an nbmand lock/share. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2016 2014
2017 2015 void *
2018 2016 rfs_remove_getfh(struct nfsdiropargs *da)
2019 2017 {
2020 2018 return (da->da_fhandle);
2021 2019 }
2022 2020
/*
 * rename a file
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* nonzero while in srcvp's nbmand crit region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must live in the same export as the
	 * source; otherwise fail with NFSERR_XDEV below.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* only the pointer identity is needed past this point */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* both handles must refer to directories */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, tovp, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * (The lookup gives us the source vnode to run the checks against.)
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		/* delegated to a v4 client: drop request, client retries */
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Reject the rename if it conflicts with an nbmand lock/share. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* keep the vnode's cached pathname in sync with the new name */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2165 2163 void *
2166 2164 rfs_rename_getfh(struct nfsrnmargs *args)
2167 2165 {
2168 2166 return (args->rna_from.da_fhandle);
2169 2167 }
2170 2168
/*
 * Link to a file.
 * Create a directory entry (to) which is a hard link to the given
 * file (from).  NFSv2 LINK handler.
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;	/* existing file being linked */
	vnode_t *tovp;		/* directory receiving the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory must be in the same export as the source
	 * file; a hard link may not cross an export boundary.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer comparison below is needed; drop the hold now. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, tovp, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * FNODSYNC on the source: only its metadata (link count)
	 * changed, not its data.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2250 2248 void *
2251 2249 rfs_link_getfh(struct nfslinkargs *args)
2252 2250 {
2253 2251 return (args->la_from);
2254 2252 }
2255 2253
2256 2254 /*
2257 2255 * Symbolicly link to a file.
2258 2256 * Create a file (to) with the given attributes which is a symbolic link
2259 2257 * to the given path name (to).
2260 2258 */
2261 2259 void
2262 2260 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2263 2261 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2264 2262 {
2265 2263 int error;
2266 2264 struct vattr va;
2267 2265 vnode_t *vp;
2268 2266 vnode_t *svp;
2269 2267 int lerror;
2270 2268 struct sockaddr *ca;
2271 2269 char *name = NULL;
2272 2270
2273 2271 /*
2274 2272 * Disallow NULL paths
2275 2273 */
2276 2274 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2277 2275 *status = NFSERR_ACCES;
2278 2276 return;
2279 2277 }
2280 2278
2281 2279 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2282 2280 if (vp == NULL) {
2283 2281 *status = NFSERR_STALE;
2284 2282 return;
2285 2283 }
2286 2284
2287 2285 if (rdonly(exi, vp, req)) {
2288 2286 VN_RELE(vp);
2289 2287 *status = NFSERR_ROFS;
2290 2288 return;
2291 2289 }
2292 2290
2293 2291 error = sattr_to_vattr(args->sla_sa, &va);
2294 2292 if (error) {
2295 2293 VN_RELE(vp);
2296 2294 *status = puterrno(error);
2297 2295 return;
2298 2296 }
2299 2297
2300 2298 if (!(va.va_mask & AT_MODE)) {
2301 2299 VN_RELE(vp);
2302 2300 *status = NFSERR_INVAL;
2303 2301 return;
2304 2302 }
2305 2303
2306 2304 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2307 2305 name = nfscmd_convname(ca, exi, args->sla_tnm,
2308 2306 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2309 2307
2310 2308 if (name == NULL) {
2311 2309 *status = NFSERR_ACCES;
2312 2310 return;
2313 2311 }
2314 2312
2315 2313 va.va_type = VLNK;
2316 2314 va.va_mask |= AT_TYPE;
2317 2315
2318 2316 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2319 2317
2320 2318 /*
2321 2319 * Force new data and metadata out to stable storage.
2322 2320 */
2323 2321 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2324 2322 NULL, cr, NULL, NULL, NULL);
2325 2323
2326 2324 if (!lerror) {
2327 2325 (void) VOP_FSYNC(svp, 0, cr, NULL);
2328 2326 VN_RELE(svp);
2329 2327 }
2330 2328
2331 2329 /*
2332 2330 * Force modified data and metadata out to stable storage.
2333 2331 */
2334 2332 (void) VOP_FSYNC(vp, 0, cr, NULL);
2335 2333
2336 2334 VN_RELE(vp);
2337 2335
2338 2336 *status = puterrno(error);
2339 2337 if (name != args->sla_tnm)
2340 2338 kmem_free(name, MAXPATHLEN);
2341 2339
2342 2340 }
2343 2341 void *
2344 2342 rfs_symlink_getfh(struct nfsslargs *args)
2345 2343 {
2346 2344 return (args->sla_from.da_fhandle);
2347 2345 }
2348 2346
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply a creation mode. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vp while va describes the new directory dvp —
			 * confirm this is intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2435 2433 void *
2436 2434 rfs_mkdir_getfh(struct nfscreatargs *args)
2437 2435 {
2438 2436 return (args->ca_da.da_fhandle);
2439 2437 }
2440 2438
/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 * NFSv2 RMDIR handler.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;		/* the parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, vp, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove (rootdir).
	 */
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2502 2500 void *
2503 2501 rfs_rmdir_getfh(struct nfsdiropargs *da)
2504 2502 {
2505 2503 return (da->da_fhandle);
2506 2504 }
2507 2505
/*
 * NFSv2 READDIR handler.  Reads up to rda_count bytes of dirent64
 * records from the directory identified by rda_fh, starting at the
 * cookie rda_offset, then converts entry names to the client's
 * character set.  rd_entries is freed later by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Shared (reader) lock is sufficient for directory reads. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns no entries but is not an error. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the reply to the protocol's maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report EOF with no entries. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * NOTE(review): if VOP_READDIR failed, rd_size has not been
	 * assigned before the conversion code below reads it — confirm
	 * the caller zeroes *rd, or that the conversion is harmless on
	 * the error path.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* No conversion took place; reply from the original buffer. */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* Conversion allocated a new buffer; swap it in. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2641 2639 void *
2642 2640 rfs_readdir_getfh(struct nfsrddirargs *rda)
2643 2641 {
2644 2642 return (&rda->rda_fh);
2645 2643 }
2646 2644 void
2647 2645 rfs_rddirfree(struct nfsrddirres *rd)
2648 2646 {
2649 2647 if (rd->rd_entries != NULL)
2650 2648 kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 2649 }
2652 2650
2653 2651 /* ARGSUSED */
2654 2652 void
2655 2653 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2656 2654 struct svc_req *req, cred_t *cr)
2657 2655 {
2658 2656 int error;
2659 2657 struct statvfs64 sb;
2660 2658 vnode_t *vp;
2661 2659
2662 2660 vp = nfs_fhtovp(fh, exi);
2663 2661 if (vp == NULL) {
2664 2662 fs->fs_status = NFSERR_STALE;
2665 2663 return;
2666 2664 }
2667 2665
2668 2666 error = VFS_STATVFS(vp->v_vfsp, &sb);
2669 2667
2670 2668 if (!error) {
2671 2669 fs->fs_tsize = nfstsize();
2672 2670 fs->fs_bsize = sb.f_frsize;
2673 2671 fs->fs_blocks = sb.f_blocks;
2674 2672 fs->fs_bfree = sb.f_bfree;
2675 2673 fs->fs_bavail = sb.f_bavail;
2676 2674 }
2677 2675
2678 2676 VN_RELE(vp);
2679 2677
2680 2678 fs->fs_status = puterrno(error);
2681 2679
2682 2680 }
2683 2681 void *
2684 2682 rfs_statfs_getfh(fhandle_t *fh)
2685 2683 {
2686 2684 return (fh);
2687 2685 }
2688 2686
/*
 * Convert NFSv2 over-the-wire settable attributes (nfssattr) into a
 * kernel vattr.  A wire value of (uint32_t)-1 means "not set" and is
 * skipped; va_mask records which attributes were supplied.  Returns 0
 * or EOVERFLOW if a 32-bit wire time cannot be represented locally.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both seconds and microseconds must be set for a time change. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2755 2753
/*
 * Map kernel vtype_t values (VNON..VBAD, used as the array index) to
 * NFSv2 over-the-wire file types.  Types with no NFSv2 representation
 * map to 0 (NFNON); VFIFO is special-cased in vattr_to_nattr().
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2759 2757
/*
 * Convert a kernel vattr into NFSv2 over-the-wire attributes (nfsfattr).
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow
 * (EFBIG for nodeid/size, EOVERFLOW for times); 0 on success.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map the local "nobody" ids to the wire "nobody" ids. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2866 2864
2867 2865 /*
2868 2866 * acl v2 support: returns approximate permission.
2869 2867 * default: returns minimal permission (more restrictive)
2870 2868 * aclok: returns maximal permission (less restrictive)
2871 2869 * This routine changes the permissions that are alaredy in *va.
2872 2870 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2873 2871 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2874 2872 */
2875 2873 static void
2876 2874 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2877 2875 {
2878 2876 vsecattr_t vsa;
2879 2877 int aclcnt;
2880 2878 aclent_t *aclentp;
2881 2879 mode_t mask_perm;
2882 2880 mode_t grp_perm;
2883 2881 mode_t other_perm;
2884 2882 mode_t other_orig;
2885 2883 int error;
2886 2884
2887 2885 /* dont care default acl */
2888 2886 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2889 2887 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2890 2888
2891 2889 if (!error) {
2892 2890 aclcnt = vsa.vsa_aclcnt;
2893 2891 if (aclcnt > MIN_ACL_ENTRIES) {
2894 2892 /* non-trivial ACL */
2895 2893 aclentp = vsa.vsa_aclentp;
2896 2894 if (exi->exi_export.ex_flags & EX_ACLOK) {
2897 2895 /* maximal permissions */
2898 2896 grp_perm = 0;
2899 2897 other_perm = 0;
2900 2898 for (; aclcnt > 0; aclcnt--, aclentp++) {
2901 2899 switch (aclentp->a_type) {
2902 2900 case USER_OBJ:
2903 2901 break;
2904 2902 case USER:
2905 2903 grp_perm |=
2906 2904 aclentp->a_perm << 3;
2907 2905 other_perm |= aclentp->a_perm;
2908 2906 break;
2909 2907 case GROUP_OBJ:
2910 2908 grp_perm |=
2911 2909 aclentp->a_perm << 3;
2912 2910 break;
2913 2911 case GROUP:
2914 2912 other_perm |= aclentp->a_perm;
2915 2913 break;
2916 2914 case OTHER_OBJ:
2917 2915 other_orig = aclentp->a_perm;
2918 2916 break;
2919 2917 case CLASS_OBJ:
2920 2918 mask_perm = aclentp->a_perm;
2921 2919 break;
2922 2920 default:
2923 2921 break;
2924 2922 }
2925 2923 }
2926 2924 grp_perm &= mask_perm << 3;
2927 2925 other_perm &= mask_perm;
2928 2926 other_perm |= other_orig;
2929 2927
2930 2928 } else {
2931 2929 /* minimal permissions */
2932 2930 grp_perm = 070;
2933 2931 other_perm = 07;
2934 2932 for (; aclcnt > 0; aclcnt--, aclentp++) {
2935 2933 switch (aclentp->a_type) {
2936 2934 case USER_OBJ:
2937 2935 break;
2938 2936 case USER:
2939 2937 case CLASS_OBJ:
2940 2938 grp_perm &=
2941 2939 aclentp->a_perm << 3;
2942 2940 other_perm &=
2943 2941 aclentp->a_perm;
2944 2942 break;
2945 2943 case GROUP_OBJ:
2946 2944 grp_perm &=
2947 2945 aclentp->a_perm << 3;
2948 2946 break;
2949 2947 case GROUP:
2950 2948 other_perm &=
2951 2949 aclentp->a_perm;
2952 2950 break;
2953 2951 case OTHER_OBJ:
2954 2952 other_perm &=
2955 2953 aclentp->a_perm;
2956 2954 break;
2957 2955 default:
2958 2956 break;
2959 2957 }
2960 2958 }
2961 2959 }
2962 2960 /* copy to va */
2963 2961 va->va_mode &= ~077;
2964 2962 va->va_mode |= grp_perm | other_perm;
2965 2963 }
2966 2964 if (vsa.vsa_aclcnt)
2967 2965 kmem_free(vsa.vsa_aclentp,
2968 2966 vsa.vsa_aclcnt * sizeof (aclent_t));
2969 2967 }
2970 2968 }
2971 2969
2972 2970 void
2973 2971 rfs_srvrinit(void)
2974 2972 {
2975 2973 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2976 2974 nfs2_srv_caller_id = fs_new_caller_id();
2977 2975 }
2978 2976
2979 2977 void
2980 2978 rfs_srvrfini(void)
2981 2979 {
2982 2980 mutex_destroy(&rfs_async_write_lock);
2983 2981 }
2984 2982
2985 2983 static int
2986 2984 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2987 2985 {
2988 2986 struct clist *wcl;
2989 2987 int wlist_len;
2990 2988 uint32_t count = rr->rr_count;
2991 2989
2992 2990 wcl = ra->ra_wlist;
2993 2991
2994 2992 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2995 2993 return (FALSE);
2996 2994 }
2997 2995
2998 2996 wcl = ra->ra_wlist;
2999 2997 rr->rr_ok.rrok_wlist_len = wlist_len;
3000 2998 rr->rr_ok.rrok_wlist = wcl;
3001 2999
3002 3000 return (TRUE);
3003 3001 }
↓ open down ↓ |
1778 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX