Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/ufs/ufs_directio.c
+++ new/usr/src/uts/common/fs/ufs/ufs_directio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * Portions of this source code were derived from Berkeley 4.3 BSD
31 31 * under license from the Regents of the University of California.
32 32 */
33 33
34 34 #include <sys/types.h>
35 35 #include <sys/t_lock.h>
36 36 #include <sys/param.h>
37 37 #include <sys/time.h>
38 38 #include <sys/systm.h>
39 39 #include <sys/sysmacros.h>
40 40 #include <sys/resource.h>
41 41 #include <sys/signal.h>
42 42 #include <sys/cred.h>
43 43 #include <sys/user.h>
44 44 #include <sys/buf.h>
45 45 #include <sys/vfs.h>
46 46 #include <sys/vnode.h>
47 47 #include <sys/proc.h>
48 48 #include <sys/disp.h>
49 49 #include <sys/file.h>
50 50 #include <sys/fcntl.h>
51 51 #include <sys/flock.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/uio.h>
54 54 #include <sys/dnlc.h>
55 55 #include <sys/conf.h>
56 56 #include <sys/mman.h>
57 57 #include <sys/pathname.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/vmsystm.h>
60 60 #include <sys/cmn_err.h>
61 61 #include <sys/filio.h>
62 62 #include <sys/atomic.h>
63 63
64 64 #include <sys/fssnap_if.h>
65 65 #include <sys/fs/ufs_fs.h>
66 66 #include <sys/fs/ufs_lockfs.h>
67 67 #include <sys/fs/ufs_filio.h>
68 68 #include <sys/fs/ufs_inode.h>
69 69 #include <sys/fs/ufs_fsdir.h>
70 70 #include <sys/fs/ufs_quota.h>
71 71 #include <sys/fs/ufs_trans.h>
72 72 #include <sys/fs/ufs_panic.h>
73 73 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
74 74 #include <sys/errno.h>
75 75
76 76 #include <sys/filio.h> /* _FIOIO */
77 77
78 78 #include <vm/hat.h>
79 79 #include <vm/page.h>
80 80 #include <vm/pvn.h>
81 81 #include <vm/as.h>
82 82 #include <vm/seg.h>
83 83 #include <vm/seg_map.h>
84 84 #include <vm/seg_vn.h>
85 85 #include <vm/seg_kmem.h>
86 86 #include <vm/rm.h>
87 87 #include <sys/swap.h>
88 88 #include <sys/epm.h>
89 89
90 90 #include <fs/fs_subr.h>
91 91
92 92 static void *ufs_directio_zero_buf;
93 93 static int ufs_directio_zero_len = 8192;
94 94
95 95 int ufs_directio_enabled = 1; /* feature is enabled */
96 96
97 97 /*
98 98 * for kstats reader
99 99 */
100 100 struct ufs_directio_kstats {
101 101 kstat_named_t logical_reads;
102 102 kstat_named_t phys_reads;
103 103 kstat_named_t hole_reads;
104 104 kstat_named_t nread;
105 105 kstat_named_t logical_writes;
106 106 kstat_named_t phys_writes;
107 107 kstat_named_t nwritten;
108 108 kstat_named_t nflushes;
109 109 } ufs_directio_kstats = {
110 110 { "logical_reads", KSTAT_DATA_UINT64 },
111 111 { "phys_reads", KSTAT_DATA_UINT64 },
112 112 { "hole_reads", KSTAT_DATA_UINT64 },
113 113 { "nread", KSTAT_DATA_UINT64 },
114 114 { "logical_writes", KSTAT_DATA_UINT64 },
115 115 { "phys_writes", KSTAT_DATA_UINT64 },
116 116 { "nwritten", KSTAT_DATA_UINT64 },
117 117 { "nflushes", KSTAT_DATA_UINT64 },
118 118 };
119 119
120 120 kstat_t *ufs_directio_kstatsp;
121 121
122 122 /*
123 123 * use kmem_cache_create for direct-physio buffers. This has shown
124 124 * a better cache distribution compared to buffers on the
125 125 * stack. It also avoids semaphore construction/deconstruction
126 126 * per request
127 127 */
128 128 struct directio_buf {
129 129 struct directio_buf *next;
130 130 char *addr;
131 131 size_t nbytes;
132 132 struct buf buf;
133 133 };
134 134 static struct kmem_cache *directio_buf_cache;
135 135
136 136
137 137 /* ARGSUSED */
138 138 static int
139 139 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
140 140 {
141 141 bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
142 142 return (0);
143 143 }
144 144
145 145 /* ARGSUSED */
146 146 static void
147 147 directio_buf_destructor(void *dbp, void *cdrarg)
148 148 {
149 149 biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
150 150 }
151 151
152 152 void
153 153 directio_bufs_init(void)
154 154 {
155 155 directio_buf_cache = kmem_cache_create("directio_buf_cache",
156 156 sizeof (struct directio_buf), 0,
157 157 directio_buf_constructor, directio_buf_destructor,
158 158 NULL, NULL, NULL, 0);
159 159 }
160 160
161 161 void
162 162 ufs_directio_init(void)
163 163 {
164 164 /*
165 165 * kstats
166 166 */
167 167 ufs_directio_kstatsp = kstat_create("ufs", 0,
168 168 "directio", "ufs", KSTAT_TYPE_NAMED,
169 169 sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
170 170 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
171 171 if (ufs_directio_kstatsp) {
172 172 ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
173 173 kstat_install(ufs_directio_kstatsp);
174 174 }
175 175 /*
176 176 * kzero is broken so we have to use a private buf of zeroes
177 177 */
178 178 ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
179 179 directio_bufs_init();
180 180 }
181 181
182 182 /*
183 183 * Wait for the first direct IO operation to finish
184 184 */
185 185 static int
186 186 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
187 187 {
188 188 buf_t *bp;
189 189 int error;
190 190
191 191 /*
192 192 * Wait for IO to finish
193 193 */
194 194 bp = &dbp->buf;
195 195 error = biowait(bp);
196 196
197 197 /*
198 198 * bytes_io will be used to figure out a resid
199 199 * for the caller. The resid is approximated by reporting
200 200 * the bytes following the first failed IO as the residual.
201 201 *
202 202 * I am cautious about using b_resid because I
203 203 * am not sure how well the disk drivers maintain it.
204 204 */
205 205 if (error)
206 206 if (bp->b_resid)
207 207 *bytes_iop = bp->b_bcount - bp->b_resid;
208 208 else
209 209 *bytes_iop = 0;
210 210 else
211 211 *bytes_iop += bp->b_bcount;
212 212 /*
213 213 * Release direct IO resources
214 214 */
215 215 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
216 216 kmem_cache_free(directio_buf_cache, dbp);
217 217 return (error);
218 218 }
219 219
220 220 /*
221 221 * Wait for all of the direct IO operations to finish
222 222 */
223 223
224 224 uint32_t ufs_directio_drop_kpri = 0; /* enable kpri hack */
225 225
226 226 static int
227 227 directio_wait(struct directio_buf *tail, long *bytes_iop)
228 228 {
229 229 int error = 0, newerror;
230 230 struct directio_buf *dbp;
231 231 uint_t kpri_req_save;
232 232
233 233 /*
234 234 * The linked list of directio buf structures is maintained
235 235 * in reverse order (tail->last request->penultimate request->...)
236 236 */
237 237 /*
238 238 * This is the k_pri_req hack. Large numbers of threads
239 239 * sleeping with kernel priority will cause scheduler thrashing
240 240 * on an MP machine. This can be seen running Oracle using
241 241 * directio to ufs files. Sleep at normal priority here to
242 242 * more closely mimic physio to a device partition. This
243 243 * workaround is disabled by default as a niced thread could
244 244 * be starved from running while holding i_rwlock and i_contents.
245 245 */
246 246 if (ufs_directio_drop_kpri) {
247 247 kpri_req_save = curthread->t_kpri_req;
248 248 curthread->t_kpri_req = 0;
249 249 }
250 250 while ((dbp = tail) != NULL) {
251 251 tail = dbp->next;
252 252 newerror = directio_wait_one(dbp, bytes_iop);
253 253 if (error == 0)
254 254 error = newerror;
255 255 }
256 256 if (ufs_directio_drop_kpri)
257 257 curthread->t_kpri_req = kpri_req_save;
258 258 return (error);
259 259 }
260 260 /*
261 261 * Initiate direct IO request
262 262 */
263 263 static void
264 264 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
265 265 offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
266 266 struct directio_buf **tailp, page_t **pplist)
267 267 {
268 268 buf_t *bp;
269 269 struct directio_buf *dbp;
270 270
271 271 /*
272 272 * Allocate a directio buf header
273 273 * Note - list is maintained in reverse order.
274 274 * directio_wait_one() depends on this fact when
275 275 * adjusting the ``bytes_io'' param. bytes_io
276 276 * is used to compute a residual in the case of error.
277 277 */
278 278 dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
279 279 dbp->next = *tailp;
280 280 *tailp = dbp;
281 281
282 282 /*
283 283 * Initialize buf header
284 284 */
285 285 dbp->addr = addr;
286 286 dbp->nbytes = nbytes;
287 287 bp = &dbp->buf;
288 288 bp->b_edev = ip->i_dev;
289 289 bp->b_lblkno = btodt(offset);
290 290 bp->b_bcount = nbytes;
291 291 bp->b_un.b_addr = addr;
292 292 bp->b_proc = procp;
293 293 bp->b_file = ip->i_vnode;
294 294
295 295 /*
296 296 * Note that S_WRITE implies B_READ and vice versa: a read(2)
297 297 * will B_READ data from the filesystem and S_WRITE it into
298 298 * the user's buffer; a write(2) will S_READ data from the
299 299 * user's buffer and B_WRITE it to the filesystem.
300 300 */
301 301 if (rw == S_WRITE) {
302 302 bp->b_flags = B_BUSY | B_PHYS | B_READ;
303 303 ufs_directio_kstats.phys_reads.value.ui64++;
304 304 ufs_directio_kstats.nread.value.ui64 += nbytes;
305 305 } else {
306 306 bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
307 307 ufs_directio_kstats.phys_writes.value.ui64++;
308 308 ufs_directio_kstats.nwritten.value.ui64 += nbytes;
309 309 }
310 310 bp->b_shadow = pplist;
311 311 if (pplist != NULL)
312 312 bp->b_flags |= B_SHADOW;
313 313
314 314 /*
315 315 * Issue I/O request.
316 316 */
317 317 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
318 318 if (ufsvfsp->vfs_snapshot)
319 319 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
320 320 else
321 321 (void) bdev_strategy(bp);
322 322
323 323 if (rw == S_WRITE)
324 324 lwp_stat_update(LWP_STAT_OUBLK, 1);
325 325 else
326 326 lwp_stat_update(LWP_STAT_INBLK, 1);
327 327
328 328 }
329 329
330 330 uint32_t ufs_shared_writes; /* writes done w/ lock shared */
331 331 uint32_t ufs_cur_writes; /* # concurrent writes */
332 332 uint32_t ufs_maxcur_writes; /* high water concurrent writes */
333 333 uint32_t ufs_posix_hits; /* writes done /w lock excl. */
334 334
335 335 /*
336 336 * Force POSIX synchronous data integrity on all writes for testing.
337 337 */
338 338 uint32_t ufs_force_posix_sdi = 0;
339 339
340 340 /*
341 341 * Direct Write
342 342 */
343 343
344 344 int
345 345 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
346 346 cred_t *cr, int *statusp)
347 347 {
348 348 long resid, bytes_written;
349 349 u_offset_t size, uoff;
350 350 uio_t *uio = arg_uio;
351 351 rlim64_t limit = uio->uio_llimit;
352 352 int on, n, error, newerror, len, has_holes;
353 353 daddr_t bn;
354 354 size_t nbytes;
355 355 struct fs *fs;
356 356 vnode_t *vp;
357 357 iovec_t *iov;
358 358 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
359 359 struct proc *procp;
360 360 struct as *as;
361 361 struct directio_buf *tail;
362 362 int exclusive, ncur, bmap_peek;
363 363 uio_t copy_uio;
364 364 iovec_t copy_iov;
365 365 char *copy_base;
366 366 long copy_resid;
367 367
368 368 /*
369 369 * assume that directio isn't possible (normal case)
370 370 */
371 371 *statusp = DIRECTIO_FAILURE;
372 372
373 373 /*
374 374 * Don't go direct
375 375 */
376 376 if (ufs_directio_enabled == 0)
377 377 return (0);
378 378
379 379 /*
380 380 * mapped file; nevermind
381 381 */
382 382 if (ip->i_mapcnt)
383 383 return (0);
384 384
385 385 /*
386 386 * CAN WE DO DIRECT IO?
387 387 */
388 388 uoff = uio->uio_loffset;
389 389 resid = uio->uio_resid;
390 390
391 391 /*
392 392 * beyond limit
393 393 */
394 394 if (uoff + resid > limit)
395 395 return (0);
396 396
397 397 /*
398 398 * must be sector aligned
399 399 */
400 400 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
401 401 return (0);
402 402
403 403 /*
404 404 * SHOULD WE DO DIRECT IO?
405 405 */
406 406 size = ip->i_size;
407 407 has_holes = -1;
408 408
409 409 /*
410 410 * only on regular files; no metadata
411 411 */
412 412 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
413 413 return (0);
414 414
415 415 /*
416 416 * Synchronous, allocating writes run very slow in Direct-Mode
417 417 * XXX - can be fixed with bmap_write changes for large writes!!!
418 418 * XXX - can be fixed for updates to "almost-full" files
419 419 * XXX - WARNING - system hangs if bmap_write() has to
420 420 * allocate lots of pages since pageout
421 421 * suspends on locked inode
422 422 */
423 423 if (!rewrite && (ip->i_flag & ISYNC)) {
424 424 if ((uoff + resid) > size)
425 425 return (0);
426 426 has_holes = bmap_has_holes(ip);
427 427 if (has_holes)
428 428 return (0);
429 429 }
430 430
431 431 /*
432 432 * Each iovec must be short aligned and sector aligned. If
433 433 * one is not, then kmem_alloc a new buffer and copy all of
434 434 * the smaller buffers into the new buffer. This new
435 435 * buffer will be short aligned and sector aligned.
436 436 */
437 437 iov = uio->uio_iov;
438 438 nbytes = uio->uio_iovcnt;
439 439 while (nbytes--) {
440 440 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
441 441 (intptr_t)(iov->iov_base) & 1) {
442 442 copy_resid = uio->uio_resid;
443 443 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
444 444 if (copy_base == NULL)
445 445 return (0);
446 446 copy_iov.iov_base = copy_base;
447 447 copy_iov.iov_len = copy_resid;
448 448 copy_uio.uio_iov = &copy_iov;
449 449 copy_uio.uio_iovcnt = 1;
450 450 copy_uio.uio_segflg = UIO_SYSSPACE;
451 451 copy_uio.uio_extflg = UIO_COPY_DEFAULT;
452 452 copy_uio.uio_loffset = uio->uio_loffset;
453 453 copy_uio.uio_resid = uio->uio_resid;
454 454 copy_uio.uio_llimit = uio->uio_llimit;
455 455 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
456 456 if (error) {
457 457 kmem_free(copy_base, copy_resid);
458 458 return (0);
459 459 }
460 460 uio = &copy_uio;
461 461 break;
462 462 }
463 463 iov++;
464 464 }
465 465
466 466 /*
467 467 * From here on down, all error exits must go to errout and
468 468 * not simply return a 0.
469 469 */
470 470
471 471 /*
472 472 * DIRECTIO
473 473 */
474 474
475 475 fs = ip->i_fs;
476 476
477 477 /*
478 478 * POSIX check. If attempting a concurrent re-write, make sure
479 479 * that this will be a single request to the driver to meet
480 480 * POSIX synchronous data integrity requirements.
481 481 */
482 482 bmap_peek = 0;
483 483 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
484 484 int upgrade = 0;
485 485
486 486 /* check easy conditions first */
487 487 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
488 488 upgrade = 1;
489 489 } else {
490 490 /* now look for contiguous allocation */
491 491 len = (ssize_t)blkroundup(fs, resid);
492 492 error = bmap_read(ip, uoff, &bn, &len);
493 493 if (error || bn == UFS_HOLE || len == 0)
494 494 goto errout;
495 495 /* save a call to bmap_read later */
496 496 bmap_peek = 1;
497 497 if (len < resid)
498 498 upgrade = 1;
499 499 }
500 500 if (upgrade) {
501 501 rw_exit(&ip->i_contents);
502 502 rw_enter(&ip->i_contents, RW_WRITER);
503 503 ufs_posix_hits++;
504 504 }
505 505 }
506 506
507 507
508 508 /*
509 509 * allocate space
510 510 */
511 511
512 512 /*
513 513 * If attempting a re-write, there is no allocation to do.
514 514 * bmap_write would trip an ASSERT if i_contents is held shared.
515 515 */
516 516 if (rewrite)
517 517 goto skip_alloc;
518 518
519 519 do {
520 520 on = (int)blkoff(fs, uoff);
521 521 n = (int)MIN(fs->fs_bsize - on, resid);
522 522 if ((uoff + n) > ip->i_size) {
523 523 error = bmap_write(ip, uoff, (int)(on + n),
524 524 (int)(uoff & (offset_t)MAXBOFFSET) == 0,
525 525 NULL, cr);
526 526 /* Caller is responsible for updating i_seq if needed */
527 527 if (error)
528 528 break;
529 529 ip->i_size = uoff + n;
530 530 ip->i_flag |= IATTCHG;
531 531 } else if (n == MAXBSIZE) {
532 532 error = bmap_write(ip, uoff, (int)(on + n),
533 533 BI_ALLOC_ONLY, NULL, cr);
534 534 /* Caller is responsible for updating i_seq if needed */
535 535 } else {
536 536 if (has_holes < 0)
537 537 has_holes = bmap_has_holes(ip);
538 538 if (has_holes) {
539 539 uint_t blk_size;
540 540 u_offset_t offset;
541 541
542 542 offset = uoff & (offset_t)fs->fs_bmask;
543 543 blk_size = (int)blksize(fs, ip,
544 544 (daddr_t)lblkno(fs, offset));
545 545 error = bmap_write(ip, uoff, blk_size,
546 546 BI_NORMAL, NULL, cr);
547 547 /*
548 548 * Caller is responsible for updating
549 549 * i_seq if needed
550 550 */
551 551 } else
552 552 error = 0;
553 553 }
554 554 if (error)
555 555 break;
556 556 uoff += n;
557 557 resid -= n;
558 558 /*
559 559 * if file has grown larger than 2GB, set flag
560 560 * in superblock if not already set
561 561 */
562 562 if ((ip->i_size > MAXOFF32_T) &&
563 563 !(fs->fs_flags & FSLARGEFILES)) {
564 564 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
565 565 mutex_enter(&ufsvfsp->vfs_lock);
566 566 fs->fs_flags |= FSLARGEFILES;
567 567 ufs_sbwrite(ufsvfsp);
568 568 mutex_exit(&ufsvfsp->vfs_lock);
569 569 }
570 570 } while (resid);
571 571
572 572 if (error) {
573 573 /*
574 574 * restore original state
575 575 */
576 576 if (resid) {
577 577 if (size == ip->i_size)
578 578 goto errout;
579 579 (void) ufs_itrunc(ip, size, 0, cr);
580 580 }
581 581 /*
582 582 * try non-directio path
583 583 */
584 584 goto errout;
585 585 }
586 586 skip_alloc:
587 587
588 588 /*
589 589 * get rid of cached pages
590 590 */
591 591 vp = ITOV(ip);
592 592 exclusive = rw_write_held(&ip->i_contents);
593 593 if (vn_has_cached_data(vp)) {
594 594 if (!exclusive) {
595 595 /*
596 596 * Still holding i_rwlock, so no allocations
597 597 * can happen after dropping contents.
598 598 */
599 599 rw_exit(&ip->i_contents);
600 600 rw_enter(&ip->i_contents, RW_WRITER);
601 601 }
602 602 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
603 603 B_INVAL, cr, NULL);
604 604 if (vn_has_cached_data(vp))
605 605 goto errout;
606 606 if (!exclusive)
↓ open down ↓ |
606 lines elided |
↑ open up ↑ |
607 607 rw_downgrade(&ip->i_contents);
608 608 ufs_directio_kstats.nflushes.value.ui64++;
609 609 }
610 610
611 611 /*
612 612 * Direct Writes
613 613 */
614 614
615 615 if (!exclusive) {
616 616 ufs_shared_writes++;
617 - ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
617 + ncur = atomic_inc_32_nv(&ufs_cur_writes);
618 618 if (ncur > ufs_maxcur_writes)
619 619 ufs_maxcur_writes = ncur;
620 620 }
621 621
622 622 /*
623 623 * proc and as are for VM operations in directio_start()
624 624 */
625 625 if (uio->uio_segflg == UIO_USERSPACE) {
626 626 procp = ttoproc(curthread);
627 627 as = procp->p_as;
628 628 } else {
629 629 procp = NULL;
630 630 as = &kas;
631 631 }
632 632 *statusp = DIRECTIO_SUCCESS;
633 633 error = 0;
634 634 newerror = 0;
635 635 resid = uio->uio_resid;
636 636 bytes_written = 0;
637 637 ufs_directio_kstats.logical_writes.value.ui64++;
638 638 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
639 639 size_t pglck_len, pglck_size;
640 640 caddr_t pglck_base;
641 641 page_t **pplist, **spplist;
642 642
643 643 tail = NULL;
644 644
645 645 /*
646 646 * Adjust number of bytes
647 647 */
648 648 iov = uio->uio_iov;
649 649 pglck_len = (size_t)MIN(iov->iov_len, resid);
650 650 pglck_base = iov->iov_base;
651 651 if (pglck_len == 0) {
652 652 uio->uio_iov++;
653 653 uio->uio_iovcnt--;
654 654 continue;
655 655 }
656 656
657 657 /*
658 658 * Try to Lock down the largest chunk of pages possible.
659 659 */
660 660 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
661 661 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
662 662
663 663 if (error)
664 664 break;
665 665
666 666 pglck_size = pglck_len;
667 667 while (pglck_len) {
668 668
669 669 nbytes = pglck_len;
670 670 uoff = uio->uio_loffset;
671 671
672 672 if (!bmap_peek) {
673 673
674 674 /*
675 675 * Re-adjust number of bytes to contiguous
676 676 * range. May have already called bmap_read
677 677 * in the case of a concurrent rewrite.
678 678 */
679 679 len = (ssize_t)blkroundup(fs, nbytes);
680 680 error = bmap_read(ip, uoff, &bn, &len);
681 681 if (error)
682 682 break;
683 683 if (bn == UFS_HOLE || len == 0)
684 684 break;
685 685 }
686 686 nbytes = (size_t)MIN(nbytes, len);
687 687 bmap_peek = 0;
688 688
689 689 /*
690 690 * Get the pagelist pointer for this offset to be
691 691 * passed to directio_start.
692 692 */
693 693
694 694 if (pplist != NULL)
695 695 spplist = pplist +
696 696 btop((uintptr_t)iov->iov_base -
697 697 ((uintptr_t)pglck_base & PAGEMASK));
698 698 else
699 699 spplist = NULL;
700 700
701 701 /*
702 702 * Kick off the direct write requests
703 703 */
704 704 directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
705 705 iov->iov_base, S_READ, procp, &tail, spplist);
706 706
707 707 /*
708 708 * Adjust pointers and counters
709 709 */
710 710 iov->iov_len -= nbytes;
711 711 iov->iov_base += nbytes;
712 712 uio->uio_loffset += nbytes;
713 713 resid -= nbytes;
714 714 pglck_len -= nbytes;
715 715 }
716 716
717 717 /*
718 718 * Wait for outstanding requests
719 719 */
↓ open down ↓ |
92 lines elided |
↑ open up ↑ |
720 720 newerror = directio_wait(tail, &bytes_written);
721 721
722 722 /*
723 723 * Release VM resources
724 724 */
725 725 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
726 726
727 727 }
728 728
729 729 if (!exclusive) {
730 - atomic_add_32(&ufs_cur_writes, -1);
730 + atomic_dec_32(&ufs_cur_writes);
731 731 /*
732 732 * If this write was done shared, readers may
733 733 * have pulled in unmodified pages. Get rid of
734 734 * these potentially stale pages.
735 735 */
736 736 if (vn_has_cached_data(vp)) {
737 737 rw_exit(&ip->i_contents);
738 738 rw_enter(&ip->i_contents, RW_WRITER);
739 739 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
740 740 B_INVAL, cr, NULL);
741 741 ufs_directio_kstats.nflushes.value.ui64++;
742 742 rw_downgrade(&ip->i_contents);
743 743 }
744 744 }
745 745
746 746 /*
747 747 * If error, adjust resid to begin at the first
748 748 * un-writable byte.
749 749 */
750 750 if (error == 0)
751 751 error = newerror;
752 752 if (error)
753 753 resid = uio->uio_resid - bytes_written;
754 754 arg_uio->uio_resid = resid;
755 755
756 756 if (!rewrite) {
757 757 ip->i_flag |= IUPD | ICHG;
758 758 /* Caller will update i_seq */
759 759 TRANS_INODE(ip->i_ufsvfs, ip);
760 760 }
761 761 /*
762 762 * If there is a residual; adjust the EOF if necessary
763 763 */
764 764 if (resid) {
765 765 if (size != ip->i_size) {
766 766 if (uio->uio_loffset > size)
767 767 size = uio->uio_loffset;
768 768 (void) ufs_itrunc(ip, size, 0, cr);
769 769 }
770 770 }
771 771
772 772 if (uio == &copy_uio)
773 773 kmem_free(copy_base, copy_resid);
774 774
775 775 return (error);
776 776
777 777 errout:
778 778 if (uio == &copy_uio)
779 779 kmem_free(copy_base, copy_resid);
780 780
781 781 return (0);
782 782 }
783 783 /*
784 784 * Direct read of a hole
785 785 */
786 786 static int
787 787 directio_hole(struct uio *uio, size_t nbytes)
788 788 {
789 789 int error = 0, nzero;
790 790 uio_t phys_uio;
791 791 iovec_t phys_iov;
792 792
793 793 ufs_directio_kstats.hole_reads.value.ui64++;
794 794 ufs_directio_kstats.nread.value.ui64 += nbytes;
795 795
796 796 phys_iov.iov_base = uio->uio_iov->iov_base;
797 797 phys_iov.iov_len = nbytes;
798 798
799 799 phys_uio.uio_iov = &phys_iov;
800 800 phys_uio.uio_iovcnt = 1;
801 801 phys_uio.uio_resid = phys_iov.iov_len;
802 802 phys_uio.uio_segflg = uio->uio_segflg;
803 803 phys_uio.uio_extflg = uio->uio_extflg;
804 804 while (error == 0 && phys_uio.uio_resid) {
805 805 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
806 806 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
807 807 &phys_uio);
808 808 }
809 809 return (error);
810 810 }
811 811
812 812 /*
813 813 * Direct Read
814 814 */
815 815 int
816 816 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
817 817 {
818 818 ssize_t resid, bytes_read;
819 819 u_offset_t size, uoff;
820 820 int error, newerror, len;
821 821 size_t nbytes;
822 822 struct fs *fs;
823 823 vnode_t *vp;
824 824 daddr_t bn;
825 825 iovec_t *iov;
826 826 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
827 827 struct proc *procp;
828 828 struct as *as;
829 829 struct directio_buf *tail;
830 830
831 831 /*
832 832 * assume that directio isn't possible (normal case)
833 833 */
834 834 *statusp = DIRECTIO_FAILURE;
835 835
836 836 /*
837 837 * Don't go direct
838 838 */
839 839 if (ufs_directio_enabled == 0)
840 840 return (0);
841 841
842 842 /*
843 843 * mapped file; nevermind
844 844 */
845 845 if (ip->i_mapcnt)
846 846 return (0);
847 847
848 848 /*
849 849 * CAN WE DO DIRECT IO?
850 850 */
851 851 /*
852 852 * must be sector aligned
853 853 */
854 854 uoff = uio->uio_loffset;
855 855 resid = uio->uio_resid;
856 856 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
857 857 return (0);
858 858 /*
859 859 * must be short aligned and sector aligned
860 860 */
861 861 iov = uio->uio_iov;
862 862 nbytes = uio->uio_iovcnt;
863 863 while (nbytes--) {
864 864 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
865 865 return (0);
866 866 if ((intptr_t)(iov++->iov_base) & 1)
867 867 return (0);
868 868 }
869 869
870 870 /*
871 871 * DIRECTIO
872 872 */
873 873 fs = ip->i_fs;
874 874
875 875 /*
876 876 * don't read past EOF
877 877 */
878 878 size = ip->i_size;
879 879
880 880 /*
881 881 * The file offset is past EOF so bail out here; we don't want
882 882 * to update uio_resid and make it look like we read something.
883 883 * We say that direct I/O was a success to avoid having rdip()
884 884 * go through the same "read past EOF logic".
885 885 */
886 886 if (uoff >= size) {
887 887 *statusp = DIRECTIO_SUCCESS;
888 888 return (0);
889 889 }
890 890
891 891 /*
892 892 * The read would extend past EOF so make it smaller.
893 893 */
894 894 if ((uoff + resid) > size) {
895 895 resid = size - uoff;
896 896 /*
897 897 * recheck sector alignment
898 898 */
899 899 if (resid & (DEV_BSIZE - 1))
900 900 return (0);
901 901 }
902 902
903 903 /*
904 904 * At this point, we know there is some real work to do.
905 905 */
906 906 ASSERT(resid);
907 907
908 908 /*
909 909 * get rid of cached pages
910 910 */
911 911 vp = ITOV(ip);
912 912 if (vn_has_cached_data(vp)) {
913 913 rw_exit(&ip->i_contents);
914 914 rw_enter(&ip->i_contents, RW_WRITER);
915 915 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
916 916 B_INVAL, cr, NULL);
917 917 if (vn_has_cached_data(vp))
918 918 return (0);
919 919 rw_downgrade(&ip->i_contents);
920 920 ufs_directio_kstats.nflushes.value.ui64++;
921 921 }
922 922 /*
923 923 * Direct Reads
924 924 */
925 925
926 926 /*
927 927 * proc and as are for VM operations in directio_start()
928 928 */
929 929 if (uio->uio_segflg == UIO_USERSPACE) {
930 930 procp = ttoproc(curthread);
931 931 as = procp->p_as;
932 932 } else {
933 933 procp = NULL;
934 934 as = &kas;
935 935 }
936 936
937 937 *statusp = DIRECTIO_SUCCESS;
938 938 error = 0;
939 939 newerror = 0;
940 940 bytes_read = 0;
941 941 ufs_directio_kstats.logical_reads.value.ui64++;
942 942 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
943 943 size_t pglck_len, pglck_size;
944 944 caddr_t pglck_base;
945 945 page_t **pplist, **spplist;
946 946
947 947 tail = NULL;
948 948
949 949 /*
950 950 * Adjust number of bytes
951 951 */
952 952 iov = uio->uio_iov;
953 953 pglck_len = (size_t)MIN(iov->iov_len, resid);
954 954 pglck_base = iov->iov_base;
955 955 if (pglck_len == 0) {
956 956 uio->uio_iov++;
957 957 uio->uio_iovcnt--;
958 958 continue;
959 959 }
960 960
961 961 /*
962 962 * Try to Lock down the largest chunk of pages possible.
963 963 */
964 964 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
965 965 error = as_pagelock(as, &pplist, pglck_base,
966 966 pglck_len, S_WRITE);
967 967
968 968 if (error)
969 969 break;
970 970
971 971 pglck_size = pglck_len;
972 972 while (pglck_len) {
973 973
974 974 nbytes = pglck_len;
975 975 uoff = uio->uio_loffset;
976 976
977 977 /*
978 978 * Re-adjust number of bytes to contiguous range
979 979 */
980 980 len = (ssize_t)blkroundup(fs, nbytes);
981 981 error = bmap_read(ip, uoff, &bn, &len);
982 982 if (error)
983 983 break;
984 984
985 985 if (bn == UFS_HOLE) {
986 986 nbytes = (size_t)MIN(fs->fs_bsize -
987 987 (long)blkoff(fs, uoff), nbytes);
988 988 error = directio_hole(uio, nbytes);
989 989 /*
990 990 * Hole reads are not added to the list
991 991 * processed by directio_wait() below so
992 992 * account for bytes read here.
993 993 */
994 994 if (!error)
995 995 bytes_read += nbytes;
996 996 } else {
997 997 nbytes = (size_t)MIN(nbytes, len);
998 998
999 999 /*
1000 1000 * Get the pagelist pointer for this offset
1001 1001 * to be passed to directio_start.
1002 1002 */
1003 1003 if (pplist != NULL)
1004 1004 spplist = pplist +
1005 1005 btop((uintptr_t)iov->iov_base -
1006 1006 ((uintptr_t)pglck_base & PAGEMASK));
1007 1007 else
1008 1008 spplist = NULL;
1009 1009
1010 1010 /*
1011 1011 * Kick off the direct read requests
1012 1012 */
1013 1013 directio_start(ufsvfsp, ip, nbytes,
1014 1014 ldbtob(bn), iov->iov_base,
1015 1015 S_WRITE, procp, &tail, spplist);
1016 1016 }
1017 1017
1018 1018 if (error)
1019 1019 break;
1020 1020
1021 1021 /*
1022 1022 * Adjust pointers and counters
1023 1023 */
1024 1024 iov->iov_len -= nbytes;
1025 1025 iov->iov_base += nbytes;
1026 1026 uio->uio_loffset += nbytes;
1027 1027 resid -= nbytes;
1028 1028 pglck_len -= nbytes;
1029 1029 }
1030 1030
1031 1031 /*
1032 1032 * Wait for outstanding requests
1033 1033 */
1034 1034 newerror = directio_wait(tail, &bytes_read);
1035 1035 /*
1036 1036 * Release VM resources
1037 1037 */
1038 1038 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1039 1039
1040 1040 }
1041 1041
1042 1042 /*
1043 1043 * If error, adjust resid to begin at the first
1044 1044 * un-read byte.
1045 1045 */
1046 1046 if (error == 0)
1047 1047 error = newerror;
1048 1048 uio->uio_resid -= bytes_read;
1049 1049 return (error);
1050 1050 }
↓ open down ↓ |
310 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX