Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/os/bio.c
+++ new/usr/src/uts/common/os/bio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 #include <sys/types.h>
40 40 #include <sys/t_lock.h>
41 41 #include <sys/sysmacros.h>
42 42 #include <sys/conf.h>
43 43 #include <sys/cpuvar.h>
44 44 #include <sys/errno.h>
45 45 #include <sys/debug.h>
46 46 #include <sys/buf.h>
47 47 #include <sys/var.h>
48 48 #include <sys/vnode.h>
49 49 #include <sys/bitmap.h>
50 50 #include <sys/cmn_err.h>
51 51 #include <sys/kmem.h>
52 52 #include <sys/vmem.h>
53 53 #include <sys/atomic.h>
54 54 #include <vm/seg_kmem.h>
55 55 #include <vm/page.h>
56 56 #include <vm/pvn.h>
57 57 #include <sys/vtrace.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/fs/ufs_inode.h>
60 60 #include <sys/fs/ufs_bio.h>
61 61 #include <sys/fs/ufs_log.h>
62 62 #include <sys/systm.h>
63 63 #include <sys/vfs.h>
64 64 #include <sys/sdt.h>
65 65
/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int 	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Hash a (device, block number) pair into an hbuf[]/dwbuf[] bucket index */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/* Sentinel terminating the b_list chains built by bflush()/bfinval() */
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv; 	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
 *       to ufs routines don't get plugged into bio.c calls so
 *       we initialize it when setting up the "lufsops" table
 *       in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void 		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01		/* reclaim buffer headers */
#define	BIO_MEM		0x02		/* reclaim buffer memory */

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct; 	/* ditto - given in % of physmem */
162 162 /*
163 163 * The following routines allocate and free
164 164 * buffers with various side effects. In general the
165 165 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
168 168 * binary semaphore so that no one else can touch it. If the block was
169 169 * already in core, no I/O need be done; if it is
170 170 * already locked, the process waits until it becomes free.
171 171 * The following routines allocate a buffer:
172 172 * getblk
173 173 * bread/BREAD
174 174 * breada
175 175 * Eventually the buffer must be released, possibly with the
176 176 * side effect of writing it out, by using one of
177 177 * bwrite/BWRITE/brwrite
178 178 * bdwrite/bdrwrite
179 179 * bawrite
180 180 * brelse
181 181 *
182 182 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
183 183 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
184 184 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
185 185 * B_DONE is still used to denote a buffer with I/O complete on it.
186 186 *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
190 190 */
191 191
/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 *
 * dev   - device the block lives on
 * blkno - block number on that device
 * bsize - size of the buffer in bytes
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}
204 204
/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * arg   - ufsvfs pointer for UFS callers, NULL otherwise; selects the
 *	   strategy routine (logging, snapshot, or plain bdev_strategy)
 * dev   - device containing the block
 * blkno - block number to read
 * bsize - expected buffer size in bytes
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	/* errflg set: during panic a busy buffer yields an error buf */
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);	/* I/O already complete; no read needed */
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;	/* charge the block read to the lwp */
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}
242 242
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 *
 * dev     - device containing both blocks
 * blkno   - block the caller wants (returned held)
 * rablkno - read-ahead block; I/O is started async and the buffer is
 *	     released, so the caller never sees it (0 disables read-ahead)
 * bsize   - buffer size in bytes for both blocks
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {	/* skip if already in the cache */
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);	/* contents already valid */
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
285 285
/*
 * Common code for writing a buffer with various options.
 *
 * arg         - ufsvfs pointer for UFS callers (NULL otherwise); selects
 *		 the strategy routine used to issue the write
 * bp          - buffer to write; caller must hold b_sem
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;	/* sample the flags before clearing any */
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;	/* charge the write to the lwp */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;	/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
335 335
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	/*
	 * NOTE(review): b_sem is presumably held by the caller, as
	 * bwrite_common() asserts -- confirm via the BWRITE() definition.
	 */
	BWRITE(bp);
}
348 348
/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	/*
	 * NOTE(review): b_sem is presumably held by the caller, as
	 * bwrite_common() asserts -- confirm via the BWRITE2() definition.
	 */
	BWRITE2(bp);
}
360 360
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	/* Stamp b_start only on the first delay so the buffer's age sticks */
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
385 385
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	/* (presumably a throttle: go synchronous when free bufs are scarce) */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
399 399
/*
 * Release the buffer, with no I/O implied.
 *
 * Puts the buffer back on its hash bucket's free list (or the
 * corresponding delayed-write list if B_DELWRI is set), wakes any
 * waiters on buffer memory, and drops b_sem.  Caller must hold b_sem.
 * B_NOCACHE buffers are destroyed outright rather than cached.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* B_AGE: splice in at the head of the av list */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* otherwise splice in at the tail */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		/* re-check under the lock before consuming the wakeup */
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
515 515
/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	/* Walk every hash chain, counting buffers marked B_BUSY */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	/* If requested and anything was busy, flush delayed writes */
	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}
548 548
/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	/* Non-ufs caller: no ufsvfs, no error-buffer fallback during panic */
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}
562 562
563 563 /*
564 564 * Assign a buffer for the given block. If the appropriate
565 565 * block is already associated, return it; otherwise search
566 566 * for the oldest non-busy buffer and reassign it.
567 567 */
568 568 struct buf *
569 569 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
570 570 {
571 571 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
572 572 struct buf *bp;
573 573 struct buf *dp;
574 574 struct buf *nbp = NULL;
575 575 struct buf *errbp;
576 576 uint_t index;
577 577 kmutex_t *hmp;
578 578 struct hbuf *hp;
579 579
580 580 if (getmajor(dev) >= devcnt)
581 581 cmn_err(CE_PANIC, "blkdev");
582 582
583 583 biostats.bio_lookup.value.ui32++;
584 584
585 585 index = bio_bhash(dev, blkno);
586 586 hp = &hbuf[index];
587 587 dp = (struct buf *)hp;
588 588 hmp = &hp->b_lock;
589 589
590 590 mutex_enter(hmp);
591 591 loop:
592 592 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
593 593 if (bp->b_blkno != blkno || bp->b_edev != dev ||
594 594 (bp->b_flags & B_STALE))
595 595 continue;
596 596 /*
597 597 * Avoid holding the hash lock in the event that
598 598 * the buffer is locked by someone. Since the hash chain
599 599 * may change when we drop the hash lock
600 600 * we have to start at the beginning of the chain if the
601 601 * buffer identity/contents aren't valid.
602 602 */
603 603 if (!sema_tryp(&bp->b_sem)) {
604 604 biostats.bio_bufbusy.value.ui32++;
605 605 mutex_exit(hmp);
606 606 /*
607 607 * OK, we are dealing with a busy buffer.
608 608 * In the case that we are panicking and we
609 609 * got called from bread(), we have some chance
610 610 * for error recovery. So better bail out from
611 611 * here since sema_p() won't block. If we got
612 612 * called directly from ufs routines, there is
613 613 * no way to report an error yet.
614 614 */
615 615 if (panicstr && errflg)
616 616 goto errout;
617 617 /*
618 618 * For the following line of code to work
619 619 * correctly never kmem_free the buffer "header".
620 620 */
621 621 sema_p(&bp->b_sem);
622 622 if (bp->b_blkno != blkno || bp->b_edev != dev ||
623 623 (bp->b_flags & B_STALE)) {
624 624 sema_v(&bp->b_sem);
625 625 mutex_enter(hmp);
626 626 goto loop; /* start over */
627 627 }
628 628 mutex_enter(hmp);
629 629 }
630 630 /* Found */
631 631 biostats.bio_hit.value.ui32++;
632 632 bp->b_flags &= ~B_AGE;
633 633
634 634 /*
635 635 * Yank it off the free/delayed write lists
636 636 */
637 637 hp->b_length--;
638 638 notavail(bp);
639 639 mutex_exit(hmp);
640 640
641 641 ASSERT((bp->b_flags & B_NOCACHE) == NULL);
642 642
643 643 if (nbp == NULL) {
644 644 /*
645 645 * Make the common path short.
646 646 */
647 647 ASSERT(SEMA_HELD(&bp->b_sem));
648 648 return (bp);
649 649 }
650 650
651 651 biostats.bio_bufdup.value.ui32++;
652 652
653 653 /*
654 654 * The buffer must have entered during the lock upgrade
655 655 * so free the new buffer we allocated and return the
656 656 * found buffer.
657 657 */
658 658 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
659 659 nbp->b_un.b_addr = NULL;
660 660
661 661 /*
662 662 * Account for the memory
663 663 */
664 664 mutex_enter(&bfree_lock);
665 665 bfreelist.b_bufsize += nbp->b_bufsize;
666 666 mutex_exit(&bfree_lock);
667 667
668 668 /*
669 669 * Destroy buf identity, and place on avail list
670 670 */
671 671 nbp->b_dev = (o_dev_t)NODEV;
672 672 nbp->b_edev = NODEV;
673 673 nbp->b_flags = 0;
674 674 nbp->b_file = NULL;
675 675 nbp->b_offset = -1;
676 676
677 677 sema_v(&nbp->b_sem);
678 678 bio_bhdr_free(nbp);
679 679
680 680 ASSERT(SEMA_HELD(&bp->b_sem));
681 681 return (bp);
682 682 }
683 683
684 684 /*
685 685 * bio_getfreeblk may block so check the hash chain again.
686 686 */
687 687 if (nbp == NULL) {
688 688 mutex_exit(hmp);
689 689 nbp = bio_getfreeblk(bsize);
690 690 mutex_enter(hmp);
691 691 goto loop;
692 692 }
693 693
694 694 /*
695 695 * New buffer. Assign nbp and stick it on the hash.
696 696 */
697 697 nbp->b_flags = B_BUSY;
698 698 nbp->b_edev = dev;
699 699 nbp->b_dev = (o_dev_t)cmpdev(dev);
700 700 nbp->b_blkno = blkno;
701 701 nbp->b_iodone = NULL;
702 702 nbp->b_bcount = bsize;
703 703 /*
704 704 * If we are given a ufsvfsp and the vfs_root field is NULL
705 705 * then this must be I/O for a superblock. A superblock's
706 706 * buffer is set up in mountfs() and there is no root vnode
707 707 * at that point.
708 708 */
709 709 if (ufsvfsp && ufsvfsp->vfs_root) {
710 710 nbp->b_vp = ufsvfsp->vfs_root;
711 711 } else {
712 712 nbp->b_vp = NULL;
713 713 }
714 714
715 715 ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
716 716
717 717 binshash(nbp, dp);
718 718 mutex_exit(hmp);
719 719
720 720 ASSERT(SEMA_HELD(&nbp->b_sem));
721 721
722 722 return (nbp);
723 723
724 724
725 725 /*
726 726 * Come here in case of an internal error. At this point we couldn't
727 727 * get a buffer, but he have to return one. Hence we allocate some
728 728 * kind of error reply buffer on the fly. This buffer is marked as
729 729 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
730 730 * - B_ERROR will indicate error to the caller.
731 731 * - B_DONE will prevent us from reading the buffer from
732 732 * the device.
733 733 * - B_NOCACHE will cause that this buffer gets free'd in
734 734 * brelse().
735 735 */
736 736
737 737 errout:
738 738 errbp = geteblk();
739 739 sema_p(&errbp->b_sem);
740 740 errbp->b_flags &= ~B_BUSY;
741 741 errbp->b_flags |= (B_ERROR | B_DONE);
742 742 return (errbp);
743 743 }
744 744
745 745 /*
746 746 * Get an empty block, not assigned to any particular device.
747 747 * Returns a locked buffer that is not on any hash or free list.
748 748 */
749 749 struct buf *
750 750 ngeteblk(long bsize)
751 751 {
752 752 struct buf *bp;
753 753
754 754 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
755 755 bioinit(bp);
756 756 bp->av_forw = bp->av_back = NULL;
757 757 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
758 758 bp->b_bufsize = bsize;
759 759 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
760 760 bp->b_dev = (o_dev_t)NODEV;
761 761 bp->b_edev = NODEV;
762 762 bp->b_lblkno = 0;
763 763 bp->b_bcount = bsize;
764 764 bp->b_iodone = NULL;
765 765 return (bp);
766 766 }
767 767
/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	/* Returns a locked, unhashed 1 KB buffer; see ngeteblk() */
	return (ngeteblk((long)1024));
}
777 777
/*
 * Return a buffer w/o sleeping
 *
 * Looks up (dev, blkno) in the cache and returns the buffer held via
 * b_sem only if the hash lock, the buffer semaphore, and valid (B_DONE)
 * contents can all be obtained without blocking; otherwise NULL.
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	/* If the hash lock is contended, give up immediately */
	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				/* Valid contents: pull it off the av list */
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}
821 821
/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 *
 * Caller must hold b_sem; this simply defers to biowait().
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}
832 832
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 *
 * Caller must hold b_sem; this simply defers to biodone().
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}
843 843
/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	/* Zero all b_bcount bytes and mark the transfer complete */
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}
854 854
855 855
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 *
 * dev - device whose delayed writes should be flushed, or NODEV to
 *	 flush delayed writes for every device.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock.  So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;	/* write-behind; don't wait */
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);	/* pull it off the delayed-write list */
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			/* B_DELWRI was cleared after we listed it; skip */
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	/* Wake anyone waiting to start another flush or invalidate */
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
951 951
/*
 * Ensure that a specified block is up-to-date on disk.
 *
 * If a delayed-write buffer for (dev, blkno) is cached, write it out
 * now; otherwise do nothing.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 * (its identity may have changed while the hash lock was dropped).
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {			/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}
1009 1009
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors). Also
 * makes sure that the retry write flag is cleared.
 *
 * Returns 0 on success, or EIO if a delayed-write buffer for the
 * device could not be invalidated (only possible when force == 0).
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;	/* private list of victims */
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				/*
				 * b_list != NULL means the buffer is already
				 * on someone else's flush/invalidate list.
				 */
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;	/* dirty and not forced */
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
1103 1103
/*
 * If possible, invalidate blocks for a dev on demand.
 * Non-forcing wrapper around bfinval(); delayed-write buffers that
 * cannot be invalidated are left alone and the error is discarded.
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}
1112 1112
/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 * Computes the buffer-cache high water mark (v.v_bufhwm), sizes the
 * hash table, and initializes the regular and delayed-write lists.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 * - 1/4 of kernel virtual memory
	 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	/*
	 * NOTE(review): pct is a divisor (physmem / pct), so a smaller
	 * pct means MORE memory; 100 / bufhwm_pct converts the tunable
	 * percentage into that divisor form.
	 */
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	/* Make every hash chain and freelist an empty circular list. */
	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}
1201 1201
1202 1202 /*
1203 1203 * Wait for I/O completion on the buffer; return error code.
1204 1204 * If bp was for synchronous I/O, bp is invalid and associated
1205 1205 * resources are freed on return.
↓ open down ↓ |
1205 lines elided |
↑ open up ↑ |
1206 1206 */
1207 1207 int
1208 1208 biowait(struct buf *bp)
1209 1209 {
1210 1210 int error = 0;
1211 1211 struct cpu *cpup;
1212 1212
1213 1213 ASSERT(SEMA_HELD(&bp->b_sem));
1214 1214
1215 1215 cpup = CPU;
1216 - atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
1216 + atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1217 1217 DTRACE_IO1(wait__start, struct buf *, bp);
1218 1218
1219 1219 /*
1220 1220 * In case of panic, busy wait for completion
1221 1221 */
1222 1222 if (panicstr) {
1223 1223 while ((bp->b_flags & B_DONE) == 0)
1224 1224 drv_usecwait(10);
1225 1225 } else
1226 1226 sema_p(&bp->b_io);
1227 1227
1228 1228 DTRACE_IO1(wait__done, struct buf *, bp);
1229 - atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
1229 + atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1230 1230
1231 1231 error = geterror(bp);
1232 1232 if ((bp->b_flags & B_ASYNC) == 0) {
1233 1233 if (bp->b_flags & B_REMAPPED)
1234 1234 bp_mapout(bp);
1235 1235 }
1236 1236 return (error);
1237 1237 }
1238 1238
/*
 * Fire the TNF "biodone" kernel probe for bp.  Kept out of line from
 * biodone() so the compiler can tail-call it (see comment at caller).
 */
static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device, device, bp->b_edev,
	    tnf_diskaddr, block, bp->b_lblkno,
	    tnf_opaque, buf, bp);
}
1248 1248
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 * If b_iodone is set, that callback takes over completion entirely
 * and is responsible for releasing the buffer.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		/* Nobody is waiting; release the buffer ourselves. */
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		/* Wake the thread blocked in biowait(). */
		sema_v(&bp->b_io);
	}
}
1283 1283
1284 1284 /*
1285 1285 * Pick up the device's error number and pass it to the user;
1286 1286 * if there is an error but the number is 0 set a generalized code.
1287 1287 */
1288 1288 int
1289 1289 geterror(struct buf *bp)
1290 1290 {
1291 1291 int error = 0;
1292 1292
1293 1293 ASSERT(SEMA_HELD(&bp->b_sem));
1294 1294 if (bp->b_flags & B_ERROR) {
1295 1295 error = bp->b_error;
1296 1296 if (!error)
1297 1297 error = EIO;
1298 1298 }
1299 1299 return (error);
1300 1300 }
1301 1301
/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 * The returned buffer is B_PAGEIO|B_NOCACHE|B_BUSY with b_sem held;
 * caller must set dev/blkno before issuing I/O (see comment at end).
 * For reads, page-in statistics and probes are updated here.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			/* Synchronous read counts as a major fault. */
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque, vnode, pp->p_vnode,
			    tnf_offset, offset, pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque, vnode, pp->p_vnode,
		    tnf_offset, offset, pp->p_offset,
		    tnf_size, size, len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
1381 1381
/*
 * Tear down and free a buf created by pageio_setup().
 * Must be called with b_sem held; the buffer is destroyed, so the
 * implied sema_v is never performed (see comment below).
 */
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);	/* drop the hold taken in pageio_setup() */
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}
1397 1397
/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * Returns 1 if any buffer on dev is busy or dirty, otherwise 0.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf *bp;
	struct buf *dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				/* drop the bucket lock before returning */
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}
1435 1435
/*
 * Hash two 32 bit entities.
 * Folds each byte of x, then each byte of y, into the accumulator
 * with a multiply-by-7-minus-1 step.
 */
int
hash2ints(int x, int y)
{
	int h;
	int shift;

	h = x - 1;
	for (shift = 8; shift <= 24; shift += 8)
		h = ((h * 7) + (x >> shift)) - 1;
	h = ((h * 7) + y) - 1;
	for (shift = 8; shift <= 24; shift += 8)
		h = ((h * 7) + (y >> shift)) - 1;

	return (h);
}
1455 1455
1456 1456
/*
 * Return a new buffer struct.
 * Create a new buffer if we haven't gone over our high water
 * mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					/* busy; try the next one */
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					/* exact fit: steal this buffer */
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		/* last resort: block until the system can give us memory */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
1573 1573
/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 * Loops until a header is obtained; may recycle cached buffers via
 * bio_recycle() when the system is out of memory.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			/* Fast path: pop the head of the free header list. */
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}
1636 1636
/*
 * Return a fully-disconnected, reset buffer header to the free
 * header list.  The asserts document the required state: off all
 * lists, no data buffer, no device identity, no flags.
 */
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}
1654 1654
/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		/* Under the hwm: just charge the request and return. */
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	/* Over the hwm: recycle until bsize worth of memory is accounted. */
	bio_recycle(BIO_MEM, bsize);
}
1675 1675
/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 * Each buffer is written B_ASYNC; the next list link is sampled
 * after the write is issued, matching the original ordering.
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}
1697 1697
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 * - we need a buffer header
 * - we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;	/* set once the caller's request is satisfied */
	kmutex_t *hmp;
	int start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				/* satisfied: flush what we gathered and go */
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
1876 1876
1877 1877 /*
1878 1878 * See if the block is associated with some buffer
1879 1879 * (mainly to avoid getting hung up on a wait in breada).
1880 1880 */
1881 1881 static int
1882 1882 bio_incore(dev_t dev, daddr_t blkno)
1883 1883 {
1884 1884 struct buf *bp;
1885 1885 struct buf *dp;
1886 1886 uint_t index;
1887 1887 kmutex_t *hmp;
1888 1888
1889 1889 index = bio_bhash(dev, blkno);
1890 1890 dp = (struct buf *)&hbuf[index];
1891 1891 hmp = &hbuf[index].b_lock;
1892 1892
1893 1893 mutex_enter(hmp);
1894 1894 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1895 1895 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1896 1896 (bp->b_flags & B_STALE) == 0) {
1897 1897 mutex_exit(hmp);
1898 1898 return (1);
1899 1899 }
1900 1900 }
1901 1901 mutex_exit(hmp);
1902 1902 return (0);
1903 1903 }
1904 1904
/*
 * Complete an async B_PAGEIO or B_REMAPPED buffer: unmap it, notify
 * the paging subsystem of the outcome, and free or release the buf.
 */
static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);	/* destroys bp */
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}
1924 1924
1925 1925 /*
1926 1926 * bioerror(9F) - indicate error in buffer header
1927 1927 * If 'error' is zero, remove the error indication.
1928 1928 */
1929 1929 void
1930 1930 bioerror(struct buf *bp, int error)
1931 1931 {
1932 1932 ASSERT(bp != NULL);
1933 1933 ASSERT(error >= 0);
1934 1934 ASSERT(SEMA_HELD(&bp->b_sem));
1935 1935
1936 1936 if (error != 0) {
1937 1937 bp->b_flags |= B_ERROR;
1938 1938 } else {
1939 1939 bp->b_flags &= ~B_ERROR;
1940 1940 }
1941 1941 bp->b_error = error;
1942 1942 }
1943 1943
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 * Tears down and re-initializes bp's semaphores and fields so the
 * header can be reused for a new transfer.
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}
1955 1955
1956 1956 /*
1957 1957 * biosize(9F) - return size of a buffer header
1958 1958 */
1959 1959 size_t
1960 1960 biosize(void)
1961 1961 {
1962 1962 return (sizeof (struct buf));
1963 1963 }
1964 1964
1965 1965 /*
1966 1966 * biomodified(9F) - check if buffer is modified
1967 1967 */
1968 1968 int
1969 1969 biomodified(struct buf *bp)
1970 1970 {
1971 1971 int npf;
1972 1972 int ppattr;
1973 1973 struct page *pp;
1974 1974
1975 1975 ASSERT(bp != NULL);
1976 1976
1977 1977 if ((bp->b_flags & B_PAGEIO) == 0) {
1978 1978 return (-1);
1979 1979 }
1980 1980 pp = bp->b_pages;
1981 1981 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1982 1982
1983 1983 while (npf > 0) {
1984 1984 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1985 1985 HAT_SYNC_STOPON_MOD);
1986 1986 if (ppattr & P_MOD)
1987 1987 return (1);
1988 1988 pp = pp->p_next;
1989 1989 npf--;
1990 1990 }
1991 1991
1992 1992 return (0);
1993 1993 }
1994 1994
/*
 * bioinit(9F) - initialize a buffer structure
 * Zeroes bp and initializes b_sem/b_io; both semaphores start at 0,
 * so b_sem begins in the "owned" state.
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}
2006 2006
/*
 * biofini(9F) - uninitialize a buffer structure
 * Destroys the semaphores created by bioinit()/pageio_setup().
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
2016 2016
/*
 * bioclone(9F) - clone a buffer
 * Creates (or reuses bp_mem as) a buffer describing a sub-range
 * [off, off+len) of bp's transfer, redirected to dev/blkno with the
 * given iodone callback.  Returns NULL only if allocation fails
 * (sleep == KM_NOSLEEP).
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		/* Caller supplied the header; reset it for reuse. */
		bufp = bp_mem;
		bioreset(bufp);
	}

/* Flags the clone inherits from the original buffer. */
#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		/* Advance the shadow page list by off's page count. */
		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			/* Walk forward to the page containing offset o. */
			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
↓ open down ↓ |
844 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX