Print this page
6583 remove whole-process swapping
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 28 * All rights reserved.
29 29 */
30 30
31 31 #include <sys/param.h>
32 32 #include <sys/types.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/cred.h>
35 35 #include <sys/buf.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/uio.h>
39 39 #include <sys/stat.h>
40 40 #include <sys/errno.h>
41 41 #include <sys/sysmacros.h>
42 42 #include <sys/statvfs.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/kstat.h>
45 45 #include <sys/dirent.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/vtrace.h>
49 49 #include <sys/mode.h>
50 50 #include <sys/acl.h>
51 51 #include <sys/nbmlock.h>
52 52 #include <sys/policy.h>
53 53 #include <sys/sdt.h>
54 54
55 55 #include <rpc/types.h>
56 56 #include <rpc/auth.h>
57 57 #include <rpc/svc.h>
58 58
59 59 #include <nfs/nfs.h>
60 60 #include <nfs/export.h>
61 61 #include <nfs/nfs_cmd.h>
62 62
63 63 #include <vm/hat.h>
64 64 #include <vm/as.h>
65 65 #include <vm/seg.h>
66 66 #include <vm/seg_map.h>
67 67 #include <vm/seg_kmem.h>
68 68
69 69 #include <sys/strsubr.h>
70 70
71 71 /*
72 72 * These are the interface routines for the server side of the
73 73 * Network File System. See the NFS version 2 protocol specification
74 74 * for a description of this interface.
75 75 */
76 76
77 77 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
78 78 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 79 cred_t *);
80 80
81 81 /*
82 82 * Some "over the wire" UNIX file types. These are encoded
83 83 * into the mode. This needs to be fixed in the next rev.
84 84 */
85 85 #define IFMT 0170000 /* type of file */
86 86 #define IFCHR 0020000 /* character special */
87 87 #define IFBLK 0060000 /* block special */
88 88 #define IFSOCK 0140000 /* socket */
89 89
90 90 u_longlong_t nfs2_srv_caller_id;
91 91
/*
 * Get file attributes (NFSv2 GETATTR).
 * Returns the current attributes of the file with the given fhandle
 * in ns->ns_attr; ns->ns_status carries the NFS status code.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the filehandle; failure means the handle is stale. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
132 132 void *
133 133 rfs_getattr_getfh(fhandle_t *fhp)
134 134 {
135 135 return (fhp);
136 136 }
137 137
/*
 * Set file attributes (NFSv2 SETATTR).
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes in ns->ns_attr; ns->ns_status carries the NFS
 * status code.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;			/* flags passed to VOP_SETATTR */
	int in_crit = 0;		/* non-zero iff inside nbmand crit region */
	vnode_t *vp;
	struct vattr va;		/* attributes requested by the client */
	struct vattr bva;		/* attributes before any size change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the truncate/extend is
			 * the span between the old and the new EOF.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Owner gets the size change via VOP_SPACE (bypasses the
		 * access check); non-owners fall through to VOP_SETATTR
		 * with AT_SIZE still set.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
325 325 void *
326 326 rfs_setattr_getfh(struct nfssaargs *args)
327 327 {
328 328 return (&args->saa_fh);
329 329 }
330 330
331 331 /*
332 332 * Directory lookup.
333 333 * Returns an fhandle and file attributes for file name in a directory.
334 334 */
335 335 /* ARGSUSED */
336 336 void
337 337 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
338 338 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
339 339 {
340 340 int error;
341 341 vnode_t *dvp;
342 342 vnode_t *vp;
343 343 struct vattr va;
344 344 fhandle_t *fhp = da->da_fhandle;
345 345 struct sec_ol sec = {0, 0};
346 346 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
347 347 char *name;
348 348 struct sockaddr *ca;
349 349
350 350 /*
351 351 * Trusted Extension doesn't support NFSv2. MOUNT
352 352 * will reject v2 clients. Need to prevent v2 client
353 353 * access via WebNFS here.
354 354 */
355 355 if (is_system_labeled() && req->rq_vers == 2) {
356 356 dr->dr_status = NFSERR_ACCES;
357 357 return;
358 358 }
359 359
360 360 /*
361 361 * Disallow NULL paths
362 362 */
363 363 if (da->da_name == NULL || *da->da_name == '\0') {
364 364 dr->dr_status = NFSERR_ACCES;
365 365 return;
366 366 }
367 367
368 368 /*
369 369 * Allow lookups from the root - the default
370 370 * location of the public filehandle.
371 371 */
372 372 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
373 373 dvp = rootdir;
374 374 VN_HOLD(dvp);
375 375 } else {
376 376 dvp = nfs_fhtovp(fhp, exi);
377 377 if (dvp == NULL) {
378 378 dr->dr_status = NFSERR_STALE;
379 379 return;
380 380 }
381 381 }
382 382
383 383 /*
384 384 * Not allow lookup beyond root.
385 385 * If the filehandle matches a filehandle of the exi,
386 386 * then the ".." refers beyond the root of an exported filesystem.
387 387 */
388 388 if (strcmp(da->da_name, "..") == 0 &&
389 389 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
390 390 VN_RELE(dvp);
391 391 dr->dr_status = NFSERR_NOENT;
392 392 return;
393 393 }
394 394
395 395 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
396 396 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
397 397 MAXPATHLEN);
398 398
399 399 if (name == NULL) {
400 400 dr->dr_status = NFSERR_ACCES;
401 401 return;
402 402 }
403 403
404 404 /*
405 405 * If the public filehandle is used then allow
406 406 * a multi-component lookup, i.e. evaluate
407 407 * a pathname and follow symbolic links if
408 408 * necessary.
409 409 *
410 410 * This may result in a vnode in another filesystem
411 411 * which is OK as long as the filesystem is exported.
412 412 */
413 413 if (PUBLIC_FH2(fhp)) {
414 414 publicfh_flag = TRUE;
415 415 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
416 416 &sec);
417 417 } else {
418 418 /*
419 419 * Do a normal single component lookup.
420 420 */
421 421 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
422 422 NULL, NULL, NULL);
423 423 }
424 424
425 425 if (name != da->da_name)
426 426 kmem_free(name, MAXPATHLEN);
427 427
428 428
429 429 if (!error) {
430 430 va.va_mask = AT_ALL; /* we want everything */
431 431
432 432 error = rfs4_delegated_getattr(vp, &va, 0, cr);
433 433
434 434 /* check for overflows */
435 435 if (!error) {
436 436 acl_perm(vp, exi, &va, cr);
437 437 error = vattr_to_nattr(&va, &dr->dr_attr);
438 438 if (!error) {
439 439 if (sec.sec_flags & SEC_QUERY)
440 440 error = makefh_ol(&dr->dr_fhandle, exi,
441 441 sec.sec_index);
442 442 else {
443 443 error = makefh(&dr->dr_fhandle, vp,
444 444 exi);
445 445 if (!error && publicfh_flag &&
446 446 !chk_clnt_sec(exi, req))
447 447 auth_weak = TRUE;
448 448 }
449 449 }
450 450 }
451 451 VN_RELE(vp);
452 452 }
453 453
454 454 VN_RELE(dvp);
455 455
456 456 /*
457 457 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
458 458 * and have obtained a new exportinfo in exi which needs to be
459 459 * released. Note the the original exportinfo pointed to by exi
460 460 * will be released by the caller, comon_dispatch.
461 461 */
462 462 if (publicfh_flag && exi != NULL)
463 463 exi_rele(exi);
464 464
465 465 /*
466 466 * If it's public fh, no 0x81, and client's flavor is
467 467 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
468 468 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
469 469 */
470 470 if (auth_weak)
471 471 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
472 472 else
473 473 dr->dr_status = puterrno(error);
474 474 }
475 475 void *
476 476 rfs_lookup_getfh(struct nfsdiropargs *da)
477 477 {
478 478 return (da->da_fhandle);
479 479 }
480 480
/*
 * Read symbolic link (NFSv2 READLINK).
 * Returns the string in the symbolic link at the given fhandle in
 * rl->rl_data (allocated here, freed later by rfs_rlfree).
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* object is an NFS referral, not a real link */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects under mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the entire buffer
		 * (uio_resid == 0) this NUL write lands one byte past the
		 * NFS_MAXPATHLEN allocation -- assumes VOP_READLINK never
		 * consumes the whole buffer; confirm.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Convert the link text for the client's character set if the
	 * export requires it.
	 *
	 * NOTE(review): the converted buffer is allocated with size
	 * MAXPATHLEN but replaces rl_data, which rfs_rlfree() frees with
	 * NFS_MAXPATHLEN -- assumes the two constants are equal; confirm.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
611 611 void *
612 612 rfs_readlink_getfh(fhandle_t *fhp)
613 613 {
614 614 return (fhp);
615 615 }
616 616 /*
617 617 * Free data allocated by rfs_readlink
618 618 */
619 619 void
620 620 rfs_rlfree(struct nfsrdlnres *rl)
621 621 {
622 622 if (rl->rl_data != NULL)
623 623 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
624 624 }
625 625
626 626 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
627 627
628 628 /*
629 629 * Read data.
630 630 * Returns some data read from the file at the given fhandle.
631 631 */
632 632 /* ARGSUSED */
633 633 void
634 634 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
635 635 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
636 636 {
637 637 vnode_t *vp;
638 638 int error;
639 639 struct vattr va;
640 640 struct iovec iov;
641 641 struct uio uio;
642 642 mblk_t *mp;
643 643 int alloc_err = 0;
644 644 int in_crit = 0;
645 645 caller_context_t ct;
646 646
647 647 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
648 648 if (vp == NULL) {
649 649 rr->rr_data = NULL;
650 650 rr->rr_status = NFSERR_STALE;
651 651 return;
652 652 }
653 653
654 654 if (vp->v_type != VREG) {
655 655 VN_RELE(vp);
656 656 rr->rr_data = NULL;
657 657 rr->rr_status = NFSERR_ISDIR;
658 658 return;
659 659 }
660 660
661 661 ct.cc_sysid = 0;
662 662 ct.cc_pid = 0;
663 663 ct.cc_caller_id = nfs2_srv_caller_id;
664 664 ct.cc_flags = CC_DONTBLOCK;
665 665
666 666 /*
667 667 * Enter the critical region before calling VOP_RWLOCK
668 668 * to avoid a deadlock with write requests.
669 669 */
670 670 if (nbl_need_check(vp)) {
671 671 nbl_start_crit(vp, RW_READER);
672 672 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
673 673 0, NULL)) {
674 674 nbl_end_crit(vp);
675 675 VN_RELE(vp);
676 676 rr->rr_data = NULL;
677 677 rr->rr_status = NFSERR_ACCES;
678 678 return;
679 679 }
680 680 in_crit = 1;
681 681 }
682 682
683 683 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
684 684
685 685 /* check if a monitor detected a delegation conflict */
686 686 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
687 687 VN_RELE(vp);
688 688 /* mark as wouldblock so response is dropped */
689 689 curthread->t_flag |= T_WOULDBLOCK;
690 690
691 691 rr->rr_data = NULL;
692 692 return;
693 693 }
694 694
695 695 va.va_mask = AT_ALL;
696 696
697 697 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
698 698
699 699 if (error) {
700 700 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
701 701 if (in_crit)
702 702 nbl_end_crit(vp);
703 703
704 704 VN_RELE(vp);
705 705 rr->rr_data = NULL;
706 706 rr->rr_status = puterrno(error);
707 707
708 708 return;
709 709 }
710 710
711 711 /*
712 712 * This is a kludge to allow reading of files created
713 713 * with no read permission. The owner of the file
714 714 * is always allowed to read it.
715 715 */
716 716 if (crgetuid(cr) != va.va_uid) {
717 717 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
718 718
719 719 if (error) {
720 720 /*
721 721 * Exec is the same as read over the net because
722 722 * of demand loading.
723 723 */
724 724 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
725 725 }
726 726 if (error) {
727 727 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
728 728 if (in_crit)
729 729 nbl_end_crit(vp);
730 730 VN_RELE(vp);
731 731 rr->rr_data = NULL;
732 732 rr->rr_status = puterrno(error);
733 733
734 734 return;
735 735 }
736 736 }
737 737
738 738 if (MANDLOCK(vp, va.va_mode)) {
739 739 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
740 740 if (in_crit)
741 741 nbl_end_crit(vp);
742 742
743 743 VN_RELE(vp);
744 744 rr->rr_data = NULL;
745 745 rr->rr_status = NFSERR_ACCES;
746 746
747 747 return;
748 748 }
749 749
750 750 rr->rr_ok.rrok_wlist_len = 0;
751 751 rr->rr_ok.rrok_wlist = NULL;
752 752
753 753 if ((u_offset_t)ra->ra_offset >= va.va_size) {
754 754 rr->rr_count = 0;
755 755 rr->rr_data = NULL;
756 756 /*
757 757 * In this case, status is NFS_OK, but there is no data
758 758 * to encode. So set rr_mp to NULL.
759 759 */
760 760 rr->rr_mp = NULL;
761 761 rr->rr_ok.rrok_wlist = ra->ra_wlist;
762 762 if (rr->rr_ok.rrok_wlist)
763 763 clist_zero_len(rr->rr_ok.rrok_wlist);
764 764 goto done;
765 765 }
766 766
767 767 if (ra->ra_wlist) {
768 768 mp = NULL;
769 769 rr->rr_mp = NULL;
770 770 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
771 771 if (ra->ra_count > iov.iov_len) {
772 772 rr->rr_data = NULL;
773 773 rr->rr_status = NFSERR_INVAL;
774 774 goto done;
775 775 }
776 776 } else {
777 777 /*
778 778 * mp will contain the data to be sent out in the read reply.
779 779 * This will be freed after the reply has been sent out (by the
780 780 * driver).
781 781 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
782 782 * that the call to xdrmblk_putmblk() never fails.
783 783 */
784 784 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
785 785 &alloc_err);
786 786 ASSERT(mp != NULL);
787 787 ASSERT(alloc_err == 0);
788 788
789 789 rr->rr_mp = mp;
790 790
791 791 /*
792 792 * Set up io vector
793 793 */
794 794 iov.iov_base = (caddr_t)mp->b_datap->db_base;
795 795 iov.iov_len = ra->ra_count;
796 796 }
797 797
798 798 uio.uio_iov = &iov;
799 799 uio.uio_iovcnt = 1;
800 800 uio.uio_segflg = UIO_SYSSPACE;
801 801 uio.uio_extflg = UIO_COPY_CACHED;
802 802 uio.uio_loffset = (offset_t)ra->ra_offset;
803 803 uio.uio_resid = ra->ra_count;
804 804
805 805 error = VOP_READ(vp, &uio, 0, cr, &ct);
806 806
807 807 if (error) {
808 808 if (mp)
809 809 freeb(mp);
810 810
811 811 /*
812 812 * check if a monitor detected a delegation conflict and
813 813 * mark as wouldblock so response is dropped
814 814 */
815 815 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
816 816 curthread->t_flag |= T_WOULDBLOCK;
817 817 else
818 818 rr->rr_status = puterrno(error);
819 819
820 820 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
821 821 if (in_crit)
822 822 nbl_end_crit(vp);
823 823
824 824 VN_RELE(vp);
825 825 rr->rr_data = NULL;
826 826
827 827 return;
828 828 }
829 829
830 830 /*
831 831 * Get attributes again so we can send the latest access
832 832 * time to the client side for his cache.
833 833 */
834 834 va.va_mask = AT_ALL;
835 835
836 836 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
837 837
838 838 if (error) {
839 839 if (mp)
840 840 freeb(mp);
841 841
842 842 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
843 843 if (in_crit)
844 844 nbl_end_crit(vp);
845 845
846 846 VN_RELE(vp);
847 847 rr->rr_data = NULL;
848 848 rr->rr_status = puterrno(error);
849 849
850 850 return;
851 851 }
852 852
853 853 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
854 854
855 855 if (mp) {
856 856 rr->rr_data = (char *)mp->b_datap->db_base;
857 857 } else {
858 858 if (ra->ra_wlist) {
859 859 rr->rr_data = (caddr_t)iov.iov_base;
860 860 if (!rdma_setup_read_data2(ra, rr)) {
861 861 rr->rr_data = NULL;
862 862 rr->rr_status = puterrno(NFSERR_INVAL);
863 863 }
864 864 }
865 865 }
866 866 done:
867 867 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
868 868 if (in_crit)
869 869 nbl_end_crit(vp);
870 870
871 871 acl_perm(vp, exi, &va, cr);
872 872
873 873 /* check for overflows */
874 874 error = vattr_to_nattr(&va, &rr->rr_attr);
875 875
876 876 VN_RELE(vp);
877 877
878 878 rr->rr_status = puterrno(error);
879 879 }
880 880
881 881 /*
882 882 * Free data allocated by rfs_read
883 883 */
884 884 void
885 885 rfs_rdfree(struct nfsrdresult *rr)
886 886 {
887 887 mblk_t *mp;
888 888
889 889 if (rr->rr_status == NFS_OK) {
890 890 mp = rr->rr_mp;
891 891 if (mp != NULL)
892 892 freeb(mp);
893 893 }
894 894 }
895 895
896 896 void *
897 897 rfs_read_getfh(struct nfsreadargs *ra)
898 898 {
899 899 return (&ra->ra_fhandle);
900 900 }
901 901
/* Number of iovec entries kept on the stack for a single sync write. */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Writes whose mblk chain fit in the stack iovec array vs. those that didn't */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
908 908
909 909 /*
910 910 * Write data to file.
911 911 * Returns attributes of a file after writing some data to it.
912 912 *
913 913 * Any changes made here, especially in error handling might have
914 914 * to also be done in rfs_write (which clusters write requests).
915 915 */
916 916 /* ARGSUSED */
917 917 void
918 918 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
919 919 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
920 920 {
921 921 int error;
922 922 vnode_t *vp;
923 923 rlim64_t rlimit;
924 924 struct vattr va;
925 925 struct uio uio;
926 926 struct iovec iov[MAX_IOVECS];
927 927 mblk_t *m;
928 928 struct iovec *iovp;
929 929 int iovcnt;
930 930 cred_t *savecred;
931 931 int in_crit = 0;
932 932 caller_context_t ct;
933 933
934 934 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
935 935 if (vp == NULL) {
936 936 ns->ns_status = NFSERR_STALE;
937 937 return;
938 938 }
939 939
940 940 if (rdonly(ro, vp)) {
941 941 VN_RELE(vp);
942 942 ns->ns_status = NFSERR_ROFS;
943 943 return;
944 944 }
945 945
946 946 if (vp->v_type != VREG) {
947 947 VN_RELE(vp);
948 948 ns->ns_status = NFSERR_ISDIR;
949 949 return;
950 950 }
951 951
952 952 ct.cc_sysid = 0;
953 953 ct.cc_pid = 0;
954 954 ct.cc_caller_id = nfs2_srv_caller_id;
955 955 ct.cc_flags = CC_DONTBLOCK;
956 956
957 957 va.va_mask = AT_UID|AT_MODE;
958 958
959 959 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
960 960
961 961 if (error) {
962 962 VN_RELE(vp);
963 963 ns->ns_status = puterrno(error);
964 964
965 965 return;
966 966 }
967 967
968 968 if (crgetuid(cr) != va.va_uid) {
969 969 /*
970 970 * This is a kludge to allow writes of files created
971 971 * with read only permission. The owner of the file
972 972 * is always allowed to write it.
973 973 */
974 974 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
975 975
976 976 if (error) {
977 977 VN_RELE(vp);
978 978 ns->ns_status = puterrno(error);
979 979 return;
980 980 }
981 981 }
982 982
983 983 /*
984 984 * Can't access a mandatory lock file. This might cause
985 985 * the NFS service thread to block forever waiting for a
986 986 * lock to be released that will never be released.
987 987 */
988 988 if (MANDLOCK(vp, va.va_mode)) {
989 989 VN_RELE(vp);
990 990 ns->ns_status = NFSERR_ACCES;
991 991 return;
992 992 }
993 993
994 994 /*
995 995 * We have to enter the critical region before calling VOP_RWLOCK
996 996 * to avoid a deadlock with ufs.
997 997 */
998 998 if (nbl_need_check(vp)) {
999 999 nbl_start_crit(vp, RW_READER);
1000 1000 in_crit = 1;
1001 1001 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1002 1002 wa->wa_count, 0, NULL)) {
1003 1003 error = EACCES;
1004 1004 goto out;
1005 1005 }
1006 1006 }
1007 1007
1008 1008 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1009 1009
1010 1010 /* check if a monitor detected a delegation conflict */
1011 1011 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1012 1012 VN_RELE(vp);
1013 1013 /* mark as wouldblock so response is dropped */
1014 1014 curthread->t_flag |= T_WOULDBLOCK;
1015 1015 return;
1016 1016 }
1017 1017
1018 1018 if (wa->wa_data || wa->wa_rlist) {
1019 1019 /* Do the RDMA thing if necessary */
1020 1020 if (wa->wa_rlist) {
1021 1021 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1022 1022 iov[0].iov_len = wa->wa_count;
1023 1023 } else {
1024 1024 iov[0].iov_base = wa->wa_data;
1025 1025 iov[0].iov_len = wa->wa_count;
1026 1026 }
1027 1027 uio.uio_iov = iov;
1028 1028 uio.uio_iovcnt = 1;
1029 1029 uio.uio_segflg = UIO_SYSSPACE;
1030 1030 uio.uio_extflg = UIO_COPY_DEFAULT;
1031 1031 uio.uio_loffset = (offset_t)wa->wa_offset;
1032 1032 uio.uio_resid = wa->wa_count;
1033 1033 /*
1034 1034 * The limit is checked on the client. We
1035 1035 * should allow any size writes here.
1036 1036 */
1037 1037 uio.uio_llimit = curproc->p_fsz_ctl;
1038 1038 rlimit = uio.uio_llimit - wa->wa_offset;
1039 1039 if (rlimit < (rlim64_t)uio.uio_resid)
1040 1040 uio.uio_resid = (uint_t)rlimit;
1041 1041
1042 1042 /*
1043 1043 * for now we assume no append mode
1044 1044 */
1045 1045 /*
1046 1046 * We're changing creds because VM may fault and we need
1047 1047 * the cred of the current thread to be used if quota
1048 1048 * checking is enabled.
1049 1049 */
1050 1050 savecred = curthread->t_cred;
1051 1051 curthread->t_cred = cr;
1052 1052 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1053 1053 curthread->t_cred = savecred;
1054 1054 } else {
1055 1055 iovcnt = 0;
1056 1056 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1057 1057 iovcnt++;
1058 1058 if (iovcnt <= MAX_IOVECS) {
1059 1059 #ifdef DEBUG
1060 1060 rfs_write_sync_hits++;
1061 1061 #endif
1062 1062 iovp = iov;
1063 1063 } else {
1064 1064 #ifdef DEBUG
1065 1065 rfs_write_sync_misses++;
1066 1066 #endif
1067 1067 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1068 1068 }
1069 1069 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1070 1070 uio.uio_iov = iovp;
1071 1071 uio.uio_iovcnt = iovcnt;
1072 1072 uio.uio_segflg = UIO_SYSSPACE;
1073 1073 uio.uio_extflg = UIO_COPY_DEFAULT;
1074 1074 uio.uio_loffset = (offset_t)wa->wa_offset;
1075 1075 uio.uio_resid = wa->wa_count;
1076 1076 /*
1077 1077 * The limit is checked on the client. We
1078 1078 * should allow any size writes here.
1079 1079 */
1080 1080 uio.uio_llimit = curproc->p_fsz_ctl;
1081 1081 rlimit = uio.uio_llimit - wa->wa_offset;
1082 1082 if (rlimit < (rlim64_t)uio.uio_resid)
1083 1083 uio.uio_resid = (uint_t)rlimit;
1084 1084
1085 1085 /*
1086 1086 * For now we assume no append mode.
1087 1087 */
1088 1088 /*
1089 1089 * We're changing creds because VM may fault and we need
1090 1090 * the cred of the current thread to be used if quota
1091 1091 * checking is enabled.
1092 1092 */
1093 1093 savecred = curthread->t_cred;
1094 1094 curthread->t_cred = cr;
1095 1095 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1096 1096 curthread->t_cred = savecred;
1097 1097
1098 1098 if (iovp != iov)
1099 1099 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1100 1100 }
1101 1101
1102 1102 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1103 1103
1104 1104 if (!error) {
1105 1105 /*
1106 1106 * Get attributes again so we send the latest mod
1107 1107 * time to the client side for his cache.
1108 1108 */
1109 1109 va.va_mask = AT_ALL; /* now we want everything */
1110 1110
1111 1111 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1112 1112
1113 1113 /* check for overflows */
1114 1114 if (!error) {
1115 1115 acl_perm(vp, exi, &va, cr);
1116 1116 error = vattr_to_nattr(&va, &ns->ns_attr);
1117 1117 }
1118 1118 }
1119 1119
1120 1120 out:
1121 1121 if (in_crit)
1122 1122 nbl_end_crit(vp);
1123 1123 VN_RELE(vp);
1124 1124
1125 1125 /* check if a monitor detected a delegation conflict */
1126 1126 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1127 1127 /* mark as wouldblock so response is dropped */
1128 1128 curthread->t_flag |= T_WOULDBLOCK;
1129 1129 else
1130 1130 ns->ns_status = puterrno(error);
1131 1131
1132 1132 }
1133 1133
/*
 * One queued NFSv2 WRITE request.  Requests against the same file
 * handle are chained together (via `list', kept in ascending offset
 * order) so that a single service thread can push the whole cluster
 * through one VOP_WRITE pass while the other threads sleep.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response to fill in for caller */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials of this request */
	bool_t ro;			/* exported read-only to this client */
	kthread_t *thread;		/* thread parked on this request */
	struct rfs_async_write *list;	/* next request in the cluster */
};

/*
 * Per-file cluster header.  All waiters cv_wait() on `cv' (under
 * rfs_async_write_lock) until the servicing thread fills in their
 * ns_status and broadcasts.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by cluster */
	kcondvar_t cv;			/* signalled when cluster serviced */
	struct rfs_async_write *list;	/* requests, sorted by wa_offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};

/* Global list of in-progress write clusters, protected by the lock below. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs kept on the stack; larger gathers fall back to kmem_alloc. */
#define	MAXCLIOVECS	42
/* Sentinel: request not yet serviced (0 would read as NFS_OK). */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* gathers that fit on the stack */
static int rfs_write_misses = 0;	/* gathers that needed kmem_alloc */
#endif
1162 1162
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Clustered write path: the first thread to arrive for a given file
 * handle creates a cluster, takes the write rwlock (the delay while a
 * previous cluster drains is what lets this one grow), and services
 * every request queued behind it; later arrivals just enqueue
 * themselves and cv_wait() until their ns_status is filled in.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	/* Clustering disabled: fall back to the simple synchronous path. */
	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	/* Our request record lives on this thread's stack. */
	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insert in ascending wa_offset order. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the servicing thread fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Grow the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop gathering at the first request that is
			 * errored out or not byte-contiguous with its
			 * predecessor; lrp ends up one past the run.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Clamp the mblk chain to wa_count bytes;
				 * the chain may carry trailing padding.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Catch any requests still marked un-serviced (e.g. only the
	 * flush above failed) and wake all waiters in the cluster.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

}
1661 1659
/*
 * Return the file handle embedded in the WRITE arguments so the
 * dispatcher can locate the export before calling rfs_write().
 */
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}
1667 1665
1668 1666 /*
1669 1667 * Create a file.
1670 1668 * Creates a file with given attributes and returns those attributes
1671 1669 * and an fhandle for the new file.
1672 1670 */
1673 1671 void
1674 1672 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1675 1673 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1676 1674 {
1677 1675 int error;
1678 1676 int lookuperr;
1679 1677 int in_crit = 0;
1680 1678 struct vattr va;
1681 1679 vnode_t *vp;
1682 1680 vnode_t *realvp;
1683 1681 vnode_t *dvp;
1684 1682 char *name = args->ca_da.da_name;
1685 1683 vnode_t *tvp = NULL;
1686 1684 int mode;
1687 1685 int lookup_ok;
1688 1686 bool_t trunc;
1689 1687 struct sockaddr *ca;
1690 1688
1691 1689 /*
1692 1690 * Disallow NULL paths
1693 1691 */
1694 1692 if (name == NULL || *name == '\0') {
1695 1693 dr->dr_status = NFSERR_ACCES;
1696 1694 return;
1697 1695 }
1698 1696
1699 1697 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1700 1698 if (dvp == NULL) {
1701 1699 dr->dr_status = NFSERR_STALE;
1702 1700 return;
1703 1701 }
1704 1702
1705 1703 error = sattr_to_vattr(args->ca_sa, &va);
1706 1704 if (error) {
1707 1705 dr->dr_status = puterrno(error);
1708 1706 return;
1709 1707 }
1710 1708
1711 1709 /*
1712 1710 * Must specify the mode.
1713 1711 */
1714 1712 if (!(va.va_mask & AT_MODE)) {
1715 1713 VN_RELE(dvp);
1716 1714 dr->dr_status = NFSERR_INVAL;
1717 1715 return;
1718 1716 }
1719 1717
1720 1718 /*
1721 1719 * This is a completely gross hack to make mknod
1722 1720 * work over the wire until we can wack the protocol
1723 1721 */
1724 1722 if ((va.va_mode & IFMT) == IFCHR) {
1725 1723 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1726 1724 va.va_type = VFIFO; /* xtra kludge for named pipe */
1727 1725 else {
1728 1726 va.va_type = VCHR;
1729 1727 /*
1730 1728 * uncompress the received dev_t
1731 1729 * if the top half is zero indicating a request
1732 1730 * from an `older style' OS.
1733 1731 */
1734 1732 if ((va.va_size & 0xffff0000) == 0)
1735 1733 va.va_rdev = nfsv2_expdev(va.va_size);
1736 1734 else
1737 1735 va.va_rdev = (dev_t)va.va_size;
1738 1736 }
1739 1737 va.va_mask &= ~AT_SIZE;
1740 1738 } else if ((va.va_mode & IFMT) == IFBLK) {
1741 1739 va.va_type = VBLK;
1742 1740 /*
1743 1741 * uncompress the received dev_t
1744 1742 * if the top half is zero indicating a request
1745 1743 * from an `older style' OS.
1746 1744 */
1747 1745 if ((va.va_size & 0xffff0000) == 0)
1748 1746 va.va_rdev = nfsv2_expdev(va.va_size);
1749 1747 else
1750 1748 va.va_rdev = (dev_t)va.va_size;
1751 1749 va.va_mask &= ~AT_SIZE;
1752 1750 } else if ((va.va_mode & IFMT) == IFSOCK) {
1753 1751 va.va_type = VSOCK;
1754 1752 } else {
1755 1753 va.va_type = VREG;
1756 1754 }
1757 1755 va.va_mode &= ~IFMT;
1758 1756 va.va_mask |= AT_TYPE;
1759 1757
1760 1758 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1761 1759 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1762 1760 MAXPATHLEN);
1763 1761 if (name == NULL) {
1764 1762 dr->dr_status = puterrno(EINVAL);
1765 1763 return;
1766 1764 }
1767 1765
1768 1766 /*
1769 1767 * Why was the choice made to use VWRITE as the mode to the
1770 1768 * call to VOP_CREATE ? This results in a bug. When a client
1771 1769 * opens a file that already exists and is RDONLY, the second
1772 1770 * open fails with an EACESS because of the mode.
1773 1771 * bug ID 1054648.
1774 1772 */
1775 1773 lookup_ok = 0;
1776 1774 mode = VWRITE;
1777 1775 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1778 1776 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1779 1777 NULL, NULL, NULL);
1780 1778 if (!error) {
1781 1779 struct vattr at;
1782 1780
1783 1781 lookup_ok = 1;
1784 1782 at.va_mask = AT_MODE;
1785 1783 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1786 1784 if (!error)
1787 1785 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1788 1786 VN_RELE(tvp);
1789 1787 tvp = NULL;
1790 1788 }
1791 1789 }
1792 1790
1793 1791 if (!lookup_ok) {
1794 1792 if (rdonly(ro, dvp)) {
1795 1793 error = EROFS;
1796 1794 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1797 1795 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1798 1796 error = EPERM;
1799 1797 } else {
1800 1798 error = 0;
1801 1799 }
1802 1800 }
1803 1801
1804 1802 /*
1805 1803 * If file size is being modified on an already existing file
1806 1804 * make sure that there are no conflicting non-blocking mandatory
1807 1805 * locks in the region being manipulated. Return EACCES if there
1808 1806 * are conflicting locks.
1809 1807 */
1810 1808 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1811 1809 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1812 1810 NULL, NULL, NULL);
1813 1811
1814 1812 if (!lookuperr &&
1815 1813 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1816 1814 VN_RELE(tvp);
1817 1815 curthread->t_flag |= T_WOULDBLOCK;
1818 1816 goto out;
1819 1817 }
1820 1818
1821 1819 if (!lookuperr && nbl_need_check(tvp)) {
1822 1820 /*
1823 1821 * The file exists. Now check if it has any
1824 1822 * conflicting non-blocking mandatory locks
1825 1823 * in the region being changed.
1826 1824 */
1827 1825 struct vattr bva;
1828 1826 u_offset_t offset;
1829 1827 ssize_t length;
1830 1828
1831 1829 nbl_start_crit(tvp, RW_READER);
1832 1830 in_crit = 1;
1833 1831
1834 1832 bva.va_mask = AT_SIZE;
1835 1833 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1836 1834 if (!error) {
1837 1835 if (va.va_size < bva.va_size) {
1838 1836 offset = va.va_size;
1839 1837 length = bva.va_size - va.va_size;
1840 1838 } else {
1841 1839 offset = bva.va_size;
1842 1840 length = va.va_size - bva.va_size;
1843 1841 }
1844 1842 if (length) {
1845 1843 if (nbl_conflict(tvp, NBL_WRITE,
1846 1844 offset, length, 0, NULL)) {
1847 1845 error = EACCES;
1848 1846 }
1849 1847 }
1850 1848 }
1851 1849 if (error) {
1852 1850 nbl_end_crit(tvp);
1853 1851 VN_RELE(tvp);
1854 1852 in_crit = 0;
1855 1853 }
1856 1854 } else if (tvp != NULL) {
1857 1855 VN_RELE(tvp);
1858 1856 }
1859 1857 }
1860 1858
1861 1859 if (!error) {
1862 1860 /*
1863 1861 * If filesystem is shared with nosuid the remove any
1864 1862 * setuid/setgid bits on create.
1865 1863 */
1866 1864 if (va.va_type == VREG &&
1867 1865 exi->exi_export.ex_flags & EX_NOSUID)
1868 1866 va.va_mode &= ~(VSUID | VSGID);
1869 1867
1870 1868 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1871 1869 NULL, NULL);
1872 1870
1873 1871 if (!error) {
1874 1872
1875 1873 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1876 1874 trunc = TRUE;
1877 1875 else
1878 1876 trunc = FALSE;
1879 1877
1880 1878 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1881 1879 VN_RELE(vp);
1882 1880 curthread->t_flag |= T_WOULDBLOCK;
1883 1881 goto out;
1884 1882 }
1885 1883 va.va_mask = AT_ALL;
1886 1884
1887 1885 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1888 1886
1889 1887 /* check for overflows */
1890 1888 if (!error) {
1891 1889 acl_perm(vp, exi, &va, cr);
1892 1890 error = vattr_to_nattr(&va, &dr->dr_attr);
1893 1891 if (!error) {
1894 1892 error = makefh(&dr->dr_fhandle, vp,
1895 1893 exi);
1896 1894 }
1897 1895 }
1898 1896 /*
1899 1897 * Force modified metadata out to stable storage.
1900 1898 *
1901 1899 * if a underlying vp exists, pass it to VOP_FSYNC
1902 1900 */
1903 1901 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1904 1902 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1905 1903 else
1906 1904 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1907 1905 VN_RELE(vp);
1908 1906 }
1909 1907
1910 1908 if (in_crit) {
1911 1909 nbl_end_crit(tvp);
1912 1910 VN_RELE(tvp);
1913 1911 }
1914 1912 }
1915 1913
1916 1914 /*
1917 1915 * Force modified data and metadata out to stable storage.
1918 1916 */
1919 1917 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1920 1918
1921 1919 out:
1922 1920
1923 1921 VN_RELE(dvp);
1924 1922
1925 1923 dr->dr_status = puterrno(error);
1926 1924
1927 1925 if (name != args->ca_da.da_name)
1928 1926 kmem_free(name, MAXPATHLEN);
1929 1927 }
/*
 * Return the parent-directory file handle from the CREATE arguments;
 * used by the dispatcher to locate the export before rfs_create().
 */
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
1935 1933
1936 1934 /*
1937 1935 * Remove a file.
1938 1936 * Remove named file from parent directory.
1939 1937 */
1940 1938 /* ARGSUSED */
1941 1939 void
1942 1940 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1943 1941 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1944 1942 {
1945 1943 int error = 0;
1946 1944 vnode_t *vp;
1947 1945 vnode_t *targvp;
1948 1946 int in_crit = 0;
1949 1947
1950 1948 /*
1951 1949 * Disallow NULL paths
1952 1950 */
1953 1951 if (da->da_name == NULL || *da->da_name == '\0') {
1954 1952 *status = NFSERR_ACCES;
1955 1953 return;
1956 1954 }
1957 1955
1958 1956 vp = nfs_fhtovp(da->da_fhandle, exi);
1959 1957 if (vp == NULL) {
1960 1958 *status = NFSERR_STALE;
1961 1959 return;
1962 1960 }
1963 1961
1964 1962 if (rdonly(ro, vp)) {
1965 1963 VN_RELE(vp);
1966 1964 *status = NFSERR_ROFS;
1967 1965 return;
1968 1966 }
1969 1967
1970 1968 /*
1971 1969 * Check for a conflict with a non-blocking mandatory share reservation.
1972 1970 */
1973 1971 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1974 1972 NULL, cr, NULL, NULL, NULL);
1975 1973 if (error != 0) {
1976 1974 VN_RELE(vp);
1977 1975 *status = puterrno(error);
1978 1976 return;
1979 1977 }
1980 1978
1981 1979 /*
1982 1980 * If the file is delegated to an v4 client, then initiate
1983 1981 * recall and drop this request (by setting T_WOULDBLOCK).
1984 1982 * The client will eventually re-transmit the request and
1985 1983 * (hopefully), by then, the v4 client will have returned
1986 1984 * the delegation.
1987 1985 */
1988 1986
1989 1987 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1990 1988 VN_RELE(vp);
1991 1989 VN_RELE(targvp);
1992 1990 curthread->t_flag |= T_WOULDBLOCK;
1993 1991 return;
1994 1992 }
1995 1993
1996 1994 if (nbl_need_check(targvp)) {
1997 1995 nbl_start_crit(targvp, RW_READER);
1998 1996 in_crit = 1;
1999 1997 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2000 1998 error = EACCES;
2001 1999 goto out;
2002 2000 }
2003 2001 }
2004 2002
2005 2003 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2006 2004
2007 2005 /*
2008 2006 * Force modified data and metadata out to stable storage.
2009 2007 */
2010 2008 (void) VOP_FSYNC(vp, 0, cr, NULL);
2011 2009
2012 2010 out:
2013 2011 if (in_crit)
2014 2012 nbl_end_crit(targvp);
2015 2013 VN_RELE(targvp);
2016 2014 VN_RELE(vp);
2017 2015
2018 2016 *status = puterrno(error);
2019 2017
2020 2018 }
2021 2019
/*
 * Return the directory file handle from the REMOVE arguments; used by
 * the dispatcher to locate the export before calling rfs_remove().
 */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2027 2025
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Up to four vnode references are live at once (fromvp, tovp, srcvp,
 * targvp); each early-error exit releases exactly the set held at
 * that point, so the release lists below grow as the function
 * progresses.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * Both directories must live in the same export; a cross-export
	 * rename is reported as NFSERR_XDEV.  The export reference from
	 * checkexport() is dropped immediately -- only the pointer
	 * identity is compared below.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both file handles must name directories. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* drop the reply; client will retransmit after recall */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached v_path of the renamed vnode up to date. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/*
 * Return the file handle the RENAME request should be dispatched on:
 * the handle of the source ("from") directory.
 */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
2176 2174
2177 2175 /*
2178 2176 * Link to a file.
2179 2177 * Create a file (to) which is a hard link to the given file (from).
2180 2178 */
2181 2179 /* ARGSUSED */
2182 2180 void
2183 2181 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2184 2182 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2185 2183 {
2186 2184 int error;
2187 2185 vnode_t *fromvp;
2188 2186 vnode_t *tovp;
2189 2187 struct exportinfo *to_exi;
2190 2188 fhandle_t *fh;
2191 2189
2192 2190 fromvp = nfs_fhtovp(args->la_from, exi);
2193 2191 if (fromvp == NULL) {
2194 2192 *status = NFSERR_STALE;
2195 2193 return;
2196 2194 }
2197 2195
2198 2196 fh = args->la_to.da_fhandle;
2199 2197 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2200 2198 if (to_exi == NULL) {
2201 2199 VN_RELE(fromvp);
2202 2200 *status = NFSERR_ACCES;
2203 2201 return;
2204 2202 }
2205 2203 exi_rele(to_exi);
2206 2204
2207 2205 if (to_exi != exi) {
2208 2206 VN_RELE(fromvp);
2209 2207 *status = NFSERR_XDEV;
2210 2208 return;
2211 2209 }
2212 2210
2213 2211 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2214 2212 if (tovp == NULL) {
2215 2213 VN_RELE(fromvp);
2216 2214 *status = NFSERR_STALE;
2217 2215 return;
2218 2216 }
2219 2217
2220 2218 if (tovp->v_type != VDIR) {
2221 2219 VN_RELE(tovp);
2222 2220 VN_RELE(fromvp);
2223 2221 *status = NFSERR_NOTDIR;
2224 2222 return;
2225 2223 }
2226 2224 /*
2227 2225 * Disallow NULL paths
2228 2226 */
2229 2227 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2230 2228 VN_RELE(tovp);
2231 2229 VN_RELE(fromvp);
2232 2230 *status = NFSERR_ACCES;
2233 2231 return;
2234 2232 }
2235 2233
2236 2234 if (rdonly(ro, tovp)) {
2237 2235 VN_RELE(tovp);
2238 2236 VN_RELE(fromvp);
2239 2237 *status = NFSERR_ROFS;
2240 2238 return;
2241 2239 }
2242 2240
2243 2241 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2244 2242
2245 2243 /*
2246 2244 * Force modified data and metadata out to stable storage.
2247 2245 */
2248 2246 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2249 2247 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 2248
2251 2249 VN_RELE(tovp);
2252 2250 VN_RELE(fromvp);
2253 2251
2254 2252 *status = puterrno(error);
2255 2253
2256 2254 }
/*
 * Return the file handle the LINK request should be dispatched on:
 * the handle of the existing ("from") file.
 */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
2262 2260
2263 2261 /*
2264 2262 * Symbolicly link to a file.
2265 2263 * Create a file (to) with the given attributes which is a symbolic link
2266 2264 * to the given path name (to).
2267 2265 */
2268 2266 void
2269 2267 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2270 2268 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2271 2269 {
2272 2270 int error;
2273 2271 struct vattr va;
2274 2272 vnode_t *vp;
2275 2273 vnode_t *svp;
2276 2274 int lerror;
2277 2275 struct sockaddr *ca;
2278 2276 char *name = NULL;
2279 2277
2280 2278 /*
2281 2279 * Disallow NULL paths
2282 2280 */
2283 2281 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2284 2282 *status = NFSERR_ACCES;
2285 2283 return;
2286 2284 }
2287 2285
2288 2286 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2289 2287 if (vp == NULL) {
2290 2288 *status = NFSERR_STALE;
2291 2289 return;
2292 2290 }
2293 2291
2294 2292 if (rdonly(ro, vp)) {
2295 2293 VN_RELE(vp);
2296 2294 *status = NFSERR_ROFS;
2297 2295 return;
2298 2296 }
2299 2297
2300 2298 error = sattr_to_vattr(args->sla_sa, &va);
2301 2299 if (error) {
2302 2300 VN_RELE(vp);
2303 2301 *status = puterrno(error);
2304 2302 return;
2305 2303 }
2306 2304
2307 2305 if (!(va.va_mask & AT_MODE)) {
2308 2306 VN_RELE(vp);
2309 2307 *status = NFSERR_INVAL;
2310 2308 return;
2311 2309 }
2312 2310
2313 2311 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2314 2312 name = nfscmd_convname(ca, exi, args->sla_tnm,
2315 2313 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 2314
2317 2315 if (name == NULL) {
2318 2316 *status = NFSERR_ACCES;
2319 2317 return;
2320 2318 }
2321 2319
2322 2320 va.va_type = VLNK;
2323 2321 va.va_mask |= AT_TYPE;
2324 2322
2325 2323 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2326 2324
2327 2325 /*
2328 2326 * Force new data and metadata out to stable storage.
2329 2327 */
2330 2328 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2331 2329 NULL, cr, NULL, NULL, NULL);
2332 2330
2333 2331 if (!lerror) {
2334 2332 (void) VOP_FSYNC(svp, 0, cr, NULL);
2335 2333 VN_RELE(svp);
2336 2334 }
2337 2335
2338 2336 /*
2339 2337 * Force modified data and metadata out to stable storage.
2340 2338 */
2341 2339 (void) VOP_FSYNC(vp, 0, cr, NULL);
2342 2340
2343 2341 VN_RELE(vp);
2344 2342
2345 2343 *status = puterrno(error);
2346 2344 if (name != args->sla_tnm)
2347 2345 kmem_free(name, MAXPATHLEN);
2348 2346
2349 2347 }
/*
 * Return the file handle the SYMLINK request should be dispatched on:
 * the handle of the directory that will contain the new link.
 */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
2355 2353
2356 2354 /*
2357 2355 * Make a directory.
2358 2356 * Create a directory with the given name, parent directory, and attributes.
2359 2357 * Returns a file handle and attributes for the new directory.
2360 2358 */
2361 2359 /* ARGSUSED */
2362 2360 void
2363 2361 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2364 2362 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2365 2363 {
2366 2364 int error;
2367 2365 struct vattr va;
2368 2366 vnode_t *dvp = NULL;
2369 2367 vnode_t *vp;
2370 2368 char *name = args->ca_da.da_name;
2371 2369
2372 2370 /*
2373 2371 * Disallow NULL paths
2374 2372 */
2375 2373 if (name == NULL || *name == '\0') {
2376 2374 dr->dr_status = NFSERR_ACCES;
2377 2375 return;
2378 2376 }
2379 2377
2380 2378 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2381 2379 if (vp == NULL) {
2382 2380 dr->dr_status = NFSERR_STALE;
2383 2381 return;
2384 2382 }
2385 2383
2386 2384 if (rdonly(ro, vp)) {
2387 2385 VN_RELE(vp);
2388 2386 dr->dr_status = NFSERR_ROFS;
2389 2387 return;
2390 2388 }
2391 2389
2392 2390 error = sattr_to_vattr(args->ca_sa, &va);
2393 2391 if (error) {
2394 2392 VN_RELE(vp);
2395 2393 dr->dr_status = puterrno(error);
2396 2394 return;
2397 2395 }
2398 2396
2399 2397 if (!(va.va_mask & AT_MODE)) {
2400 2398 VN_RELE(vp);
2401 2399 dr->dr_status = NFSERR_INVAL;
2402 2400 return;
2403 2401 }
2404 2402
2405 2403 va.va_type = VDIR;
2406 2404 va.va_mask |= AT_TYPE;
2407 2405
2408 2406 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2409 2407
2410 2408 if (!error) {
2411 2409 /*
2412 2410 * Attribtutes of the newly created directory should
2413 2411 * be returned to the client.
2414 2412 */
2415 2413 va.va_mask = AT_ALL; /* We want everything */
2416 2414 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2417 2415
2418 2416 /* check for overflows */
2419 2417 if (!error) {
2420 2418 acl_perm(vp, exi, &va, cr);
2421 2419 error = vattr_to_nattr(&va, &dr->dr_attr);
2422 2420 if (!error) {
2423 2421 error = makefh(&dr->dr_fhandle, dvp, exi);
2424 2422 }
2425 2423 }
2426 2424 /*
2427 2425 * Force new data and metadata out to stable storage.
2428 2426 */
2429 2427 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2430 2428 VN_RELE(dvp);
2431 2429 }
2432 2430
2433 2431 /*
2434 2432 * Force modified data and metadata out to stable storage.
2435 2433 */
2436 2434 (void) VOP_FSYNC(vp, 0, cr, NULL);
2437 2435
2438 2436 VN_RELE(vp);
2439 2437
2440 2438 dr->dr_status = puterrno(error);
2441 2439
2442 2440 }
/*
 * Return the file handle the MKDIR request should be dispatched on:
 * the handle of the parent directory.
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2448 2446
2449 2447 /*
2450 2448 * Remove a directory.
2451 2449 * Remove the given directory name from the given parent directory.
2452 2450 */
2453 2451 /* ARGSUSED */
2454 2452 void
2455 2453 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2456 2454 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2457 2455 {
2458 2456 int error;
2459 2457 vnode_t *vp;
2460 2458
2461 2459 /*
2462 2460 * Disallow NULL paths
2463 2461 */
2464 2462 if (da->da_name == NULL || *da->da_name == '\0') {
2465 2463 *status = NFSERR_ACCES;
2466 2464 return;
2467 2465 }
2468 2466
2469 2467 vp = nfs_fhtovp(da->da_fhandle, exi);
2470 2468 if (vp == NULL) {
2471 2469 *status = NFSERR_STALE;
2472 2470 return;
2473 2471 }
2474 2472
2475 2473 if (rdonly(ro, vp)) {
2476 2474 VN_RELE(vp);
2477 2475 *status = NFSERR_ROFS;
2478 2476 return;
2479 2477 }
2480 2478
2481 2479 /*
2482 2480 * VOP_RMDIR takes a third argument (the current
2483 2481 * directory of the process). That's because someone
2484 2482 * wants to return EINVAL if one tries to remove ".".
2485 2483 * Of course, NFS servers have no idea what their
2486 2484 * clients' current directories are. We fake it by
2487 2485 * supplying a vnode known to exist and illegal to
2488 2486 * remove.
2489 2487 */
2490 2488 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 2489
2492 2490 /*
2493 2491 * Force modified data and metadata out to stable storage.
2494 2492 */
2495 2493 (void) VOP_FSYNC(vp, 0, cr, NULL);
2496 2494
2497 2495 VN_RELE(vp);
2498 2496
2499 2497 /*
2500 2498 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2501 2499 * if the directory is not empty. A System V NFS server
2502 2500 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2503 2501 * over the wire.
2504 2502 */
2505 2503 if (error == EEXIST)
2506 2504 *status = NFSERR_NOTEMPTY;
2507 2505 else
2508 2506 *status = puterrno(error);
2509 2507
2510 2508 }
/*
 * Return the file handle the RMDIR request should be dispatched on:
 * the handle of the parent directory.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2516 2514
/*
 * Read entries from a directory on behalf of the client.
 * Fills in rd with a buffer of dirent64 records (freed later by
 * rfs_rddirfree()), the number of bytes returned, and an EOF flag.
 * Entry names are converted to the client's character set; entries
 * that no longer fit after conversion are dropped and EOF is cleared
 * so the client will come back for them.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Take the vnode read lock for the duration of the VOP calls. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer to the v2 protocol maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries. This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report EOF. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * NOTE(review): if VOP_READDIR failed, rd->rd_size is not set
	 * here before being passed to nfscmd_countents() below --
	 * presumably the caller zeroes the result struct; verify.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/* Swap in the converted buffer if a new one was allocated. */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
/*
 * Return the file handle the READDIR request should be dispatched on:
 * the handle of the directory being read.
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
/*
 * Free the entry buffer allocated by rfs_readdir().  rd_bufsize is the
 * size recorded at allocation time, as required by kmem_free().
 */
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}
2661 2659
2662 2660 /* ARGSUSED */
2663 2661 void
2664 2662 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2665 2663 struct svc_req *req, cred_t *cr, bool_t ro)
2666 2664 {
2667 2665 int error;
2668 2666 struct statvfs64 sb;
2669 2667 vnode_t *vp;
2670 2668
2671 2669 vp = nfs_fhtovp(fh, exi);
2672 2670 if (vp == NULL) {
2673 2671 fs->fs_status = NFSERR_STALE;
2674 2672 return;
2675 2673 }
2676 2674
2677 2675 error = VFS_STATVFS(vp->v_vfsp, &sb);
2678 2676
2679 2677 if (!error) {
2680 2678 fs->fs_tsize = nfstsize();
2681 2679 fs->fs_bsize = sb.f_frsize;
2682 2680 fs->fs_blocks = sb.f_blocks;
2683 2681 fs->fs_bfree = sb.f_bfree;
2684 2682 fs->fs_bavail = sb.f_bavail;
2685 2683 }
2686 2684
2687 2685 VN_RELE(vp);
2688 2686
2689 2687 fs->fs_status = puterrno(error);
2690 2688
2691 2689 }
/*
 * Return the file handle the STATFS request should be dispatched on
 * (the argument is the handle itself).
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2697 2695
/*
 * Convert an over-the-wire NFSv2 sattr into a vattr, setting va_mask
 * bits only for fields the client actually supplied.  In the v2
 * protocol an all-ones field value means "do not set this attribute".
 * Returns 0, or EOVERFLOW when a wire time cannot be represented in
 * the kernel's time_t (32-bit kernels only).
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both tv_sec and tv_usec must be set for the time to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2764 2762
/*
 * Map vnode types (indexed by vtype_t) to NFSv2 on-the-wire file
 * types.  Zero entries are vnode types with no v2 representation.
 * VFIFO is remapped separately via NA_SETFIFO in vattr_to_nattr().
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2768 2766
/*
 * Convert a vattr into the over-the-wire NFSv2 fattr.
 *
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow
 * (EFBIG for nodeid/size, EOVERFLOW for times); 0 on success.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* -1 is the wire sentinel for "unknown mode". */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* Wire times carry microseconds; vattr carries nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2875 2873
/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
 *
 * NFSv2 has no way to express a full ACL, so the group/other mode
 * bits are adjusted to approximate it.  Only the low six mode bits
 * (group and other) are modified; the owner bits are left alone.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;
	mode_t grp_perm;
	mode_t other_perm;
	mode_t other_orig;
	int error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/*
				 * maximal permissions: OR together every
				 * grant, then apply the CLASS_OBJ mask.
				 *
				 * NOTE(review): mask_perm and other_orig
				 * are only assigned if CLASS_OBJ/OTHER_OBJ
				 * entries are present -- presumably a valid
				 * non-trivial ACL always contains both;
				 * verify against the ACL code.
				 */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/*
				 * minimal permissions: start from full
				 * group/other access and AND away anything
				 * any entry would deny.
				 */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
2980 2978
/*
 * Module initialization for the NFSv2 server: set up the async-write
 * lock and obtain a caller id for this server's VOP calls.
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	nfs2_srv_caller_id = fs_new_caller_id();
}
2987 2985
/*
 * Module teardown counterpart to rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
2993 2991
2994 2992 static int
2995 2993 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2996 2994 {
2997 2995 struct clist *wcl;
2998 2996 int wlist_len;
2999 2997 uint32_t count = rr->rr_count;
3000 2998
3001 2999 wcl = ra->ra_wlist;
3002 3000
3003 3001 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3004 3002 return (FALSE);
3005 3003 }
3006 3004
3007 3005 wcl = ra->ra_wlist;
3008 3006 rr->rr_ok.rrok_wlist_len = wlist_len;
3009 3007 rr->rr_ok.rrok_wlist = wcl;
3010 3008
3011 3009 return (TRUE);
3012 3010 }
↓ open down ↓ |
1783 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX