/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive. A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;
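/*
 * Illustrative only (not part of this file's interface): on illumos this
 * counter can be inspected from a live kernel with mdb, e.g.
 * "echo 'zfs_free_range_recv_miss/E' | mdb -k"; the exact invocation is
 * an assumption, not something this file defines.
 */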

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	dmu_buf_impl_t *db;
	dmu_buf_impl_t key;
	avl_index_t where;

	key.db_level = level;
	key.db_blkid = blkid;
	key.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	db = avl_find(&dn->dn_dbufs, &key, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if ((db->db_level != level) || (db->db_blkid != blkid))
			break;

		mutex_enter(&db->db_mtx);
		if (db->db_state != DB_EVICTING) {
			mutex_exit(&dn->dn_dbufs_mtx);
			return (db);
		}
		mutex_exit(&db->db_mtx);
	}

	mutex_exit(&dn->dn_dbufs_mtx);
	return (NULL);
}
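/*
 * Illustrative only (not part of the build): dbuf_find() returns with
 * db_mtx held on success, so a typical caller looks like
 *
 *	dmu_buf_impl_t *db = dbuf_find(dn, 0, blkid);
 *	if (db != NULL) {
 *		... inspect db ...
 *		mutex_exit(&db->db_mtx);
 *	}
 */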

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
}

void
dbuf_fini(void)
{
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock. XXX indblksz no longer
			 * grows. safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read. Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
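/*
 * Worked example (illustrative): with 128K data blocks,
 * dn_datablkshift == 17, so offset 300000 maps to block
 * 300000 >> 17 == 2. Objects small enough to fit in a single
 * (possibly odd-sized) block have dn_datablkshift == 0, and every
 * in-range offset maps to block 0.
 */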

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED. Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED. Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
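/*
 * Illustrative caller sketch (not part of the build): passing a NULL zio
 * makes the read synchronous, since dbuf_read() then creates a root zio
 * internally and waits on it:
 *
 *	error = dbuf_read(db, NULL,
 *	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH);
 */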

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function. It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a buffer
 * for the first time in a txg, and when we are freeing a range in a
 * dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do not put
 * a hold on the buffer; we just traverse the active dbuf list for
 * the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it still references the dbuf's data, either reset the
	 * reference to point to a new copy, or (if there are no
	 * active holders) just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}
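/*
 * Example scenario (illustrative): the dbuf was dirtied in txg N, which
 * is now syncing, and a writer dirties it again in txg N+1. The copy
 * made above keeps txg N's image stable for the in-flight sync while
 * txg N+1 modifies the live buffer.
 */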

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state. Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release(). Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next, db_search;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order. If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be
	 * freed. Don't pass the bp to dsl_dataset_block_freeable() since
	 * we are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

dbuf_dirty_record_t *
dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx, boolean_t usesc)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context. Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty. They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too? The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}

		/*
		 * The special-class usage of this dirty dbuf may have
		 * changed, so update the dirty record.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context. Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx. This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block. Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	dr->dr_usesc = usesc;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty. We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty_sc(dn, tx, usesc);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet. We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, usesc, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty_sc(parent, tx, usesc);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}

		/*
		 * The special-class usage of this dirty dbuf may have
		 * changed, so update the dirty record.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty_sc(dn, tx, usesc);
	DB_DNODE_EXIT(db);
	return (dr);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	spa_t *spa;

	ASSERT(db->db_objset != NULL);
	spa = db->db_objset->os_spa;

	return (dbuf_dirty_sc(db, tx, spa->spa_usesc));
}
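/*
 * Sketch of the common dirtying pattern (illustrative only; error
 * handling and the surrounding object bookkeeping are omitted):
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 *	dmu_buf_will_dirty(&db->db, tx);
 *	... modify db->db.db_data ...
 *	dmu_tx_commit(tx);
 */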

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction. Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync(). This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_will_dirty_sc(db_fake, tx, B_TRUE);
}

void
dmu_buf_will_dirty_sc(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t usesc)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty_sc(db, tx, usesc);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}
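/*
 * Illustrative fill protocol (not part of the build): callers that will
 * overwrite an entire block skip the read-modify-write with
 * dmu_buf_will_fill() and signal completion via dmu_buf_fill_done():
 *
 *	dmu_buf_will_fill(&db->db, tx);
 *	bcopy(src, db->db.db_data, db->db.db_size);
 *	dmu_buf_fill_done(&db->db, tx);
 */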

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
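/*
 * Sketch of the zero-copy write pattern this enables (illustrative):
 *
 *	abuf = dbuf_loan_arcbuf(db);
 *	... fill abuf->b_data ...
 *	dbuf_assign_arcbuf(db, abuf, tx);
 *
 * If nobody but the caller holds the dbuf, the arc buf is adopted
 * directly; otherwise its contents are copied and the loaned buf is
 * freed via arc_buf_remove_ref().
 */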

/*
 * "Clear" the contents of this dbuf. This will mark the dbuf
 * EVICTING and clear *most* of its references. Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
 * in this case. For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;
	avl_index_t where;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed into the dnode's dbuf tree */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	mutex_enter(&dn->dn_dbufs_mtx);
	mutex_enter(&db->db_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = avl_find(&dn->dn_dbufs, db, &where)) != NULL) {
		/* someone else inserted it first */
		mutex_exit(&db->db_mtx);
		kmem_cache_free(dbuf_cache, db);
		mutex_enter(&odb->db_mtx);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_insert(&dn->dn_dbufs, db, where);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
		/*
		 * This dbuf is already in the cache. We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}
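/*
 * Illustrative direct use (not part of the build); the zfetch code and
 * callers such as dmu_prefetch() are the normal entry points:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	dbuf_prefetch(dn, blkid, ZIO_PRIORITY_ASYNC_READ);
 *	rw_exit(&dn->dn_struct_rwlock);
 */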

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are still
	 * referencing it from db_data, we need to make a copy of it in
	 * case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}
1903
1904 void
1905 dbuf_create_bonus(dnode_t *dn)
1906 {
1907 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1908
1909 ASSERT(dn->dn_bonus == NULL);
1910 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1911 }
1912
1913 int
1914 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1915 {
1916 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1917 dnode_t *dn;
1918
1919 if (db->db_blkid != DMU_SPILL_BLKID)
1920 return (SET_ERROR(ENOTSUP));
1921 if (blksz == 0)
1922 blksz = SPA_MINBLOCKSIZE;
1923 if (blksz > SPA_MAXBLOCKSIZE)
1924 blksz = SPA_MAXBLOCKSIZE;
1925 else
1926 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1927
1928 DB_DNODE_ENTER(db);
1929 dn = DB_DNODE(db);
1930 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1931 dbuf_new_size(db, blksz, tx);
1932 rw_exit(&dn->dn_struct_rwlock);
1933 DB_DNODE_EXIT(db);
1934
1935 return (0);
1936 }
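/*
 * A worked example of the clamping above (with the usual
 * SPA_MINBLOCKSIZE of 512): blksz == 0 becomes 512, blksz == 3000 is
 * rounded up to P2ROUNDUP(3000, 512) == 3072, and any request above
 * SPA_MAXBLOCKSIZE is clamped down to SPA_MAXBLOCKSIZE.
 */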
1937
1938 void
1939 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1940 {
1941 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
1942 }
1943
1944 #pragma weak dmu_buf_add_ref = dbuf_add_ref
1945 void
1946 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1947 {
1948 int64_t holds = refcount_add(&db->db_holds, tag);
1949 ASSERT(holds > 1);
1950 }
1951
1952 /*
1953 * If you call dbuf_rele() you had better not be referencing the dnode handle
1954 * unless you have some other direct or indirect hold on the dnode. (An indirect
1955 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
1956 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
1957 * dnode's parent dbuf evicting its dnode handles.
1958 */
1959 void
1960 dbuf_rele(dmu_buf_impl_t *db, void *tag)
1961 {
1962 mutex_enter(&db->db_mtx);
1963 dbuf_rele_and_unlock(db, tag);
1964 }
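/*
 * Illustrative hazard (a sketch of the misuse the comment above warns
 * against), assuming the caller has no hold on the dnode other than
 * the dbuf itself:
 *
 *	dnode_t *dn = DB_DNODE(db);
 *	dbuf_rele(db, FTAG);
 *	dnode_rele(dn, FTAG);		BAD: dbuf_rele() may have dropped
 *					the last indirect hold, letting
 *					the parent dbuf evict the handle.
 *
 * Take a direct dnode hold (dnode_hold()) first if the dnode must
 * outlive the dbuf_rele().
 */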
1965
1966 void
1967 dmu_buf_rele(dmu_buf_t *db, void *tag)
1968 {
1969 dbuf_rele((dmu_buf_impl_t *)db, tag);
1970 }
1971
1972 /*
1973 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
1974 * db_dirtycnt and db_holds to be updated atomically.
1975 */
1976 void
1977 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
1978 {
1979 int64_t holds;
1980
1981 ASSERT(MUTEX_HELD(&db->db_mtx));
1982 DBUF_VERIFY(db);
1983
1984 /*
1985 * Remove the reference to the dbuf before removing its hold on the
1986 * dnode so we can guarantee in dnode_move() that a referenced bonus
1987 * buffer has a corresponding dnode hold.
1988 */
1989 holds = refcount_remove(&db->db_holds, tag);
1990 ASSERT(holds >= 0);
1991
1992 /*
1993 * We can't freeze indirects if there is a possibility that they
1994 * may be modified in the current syncing context.
1995 */
1996 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
1997 arc_buf_freeze(db->db_buf);
1998
1999 if (holds == db->db_dirtycnt &&
2000 db->db_level == 0 && db->db_immediate_evict)
2001 dbuf_evict_user(db);
2002
2003 if (holds == 0) {
2004 if (db->db_blkid == DMU_BONUS_BLKID) {
2005 mutex_exit(&db->db_mtx);
2006
2007 /*
2008 * If the dnode moves here, we cannot cross this barrier
2009 * until the move completes.
2010 */
2011 DB_DNODE_ENTER(db);
2012 atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count);
2013 DB_DNODE_EXIT(db);
2014 /*
2015 * The bonus buffer's dnode hold is no longer discounted
2016 * in dnode_move(). The dnode cannot move until after
2017 * the dnode_rele().
2018 */
2019 dnode_rele(DB_DNODE(db), db);
2020 } else if (db->db_buf == NULL) {
2021 /*
2022 * This is a special case: we never associated this
2023 * dbuf with any data allocated from the ARC.
2024 */
2025 ASSERT(db->db_state == DB_UNCACHED ||
2026 db->db_state == DB_NOFILL);
2027 dbuf_evict(db);
2028 } else if (arc_released(db->db_buf)) {
2029 arc_buf_t *buf = db->db_buf;
2030 /*
2031 * This dbuf has anonymous data associated with it.
2032 */
2033 dbuf_set_data(db, NULL);
2034 VERIFY(arc_buf_remove_ref(buf, db));
2035 dbuf_evict(db);
2036 } else {
2037 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2038
2039 /*
			 * A dbuf will be evicted here if either it is
			 * excluded by the 'primarycache' property or a
			 * duplicate copy of this buffer is already cached
			 * in the arc.
			 *
			 * In the case of the 'primarycache' property, a
			 * buffer is evicted if it does not match the
			 * criteria set by the property.
2047 *
2048 * To decide if our buffer is considered a
2049 * duplicate, we must call into the arc to determine
2050 * if multiple buffers are referencing the same
2051 * block on-disk. If so, then we simply evict
2052 * ourselves.
2053 */
2054 if (!DBUF_IS_CACHEABLE(db)) {
2055 if (db->db_blkptr != NULL &&
2056 !BP_IS_HOLE(db->db_blkptr) &&
2057 !BP_IS_EMBEDDED(db->db_blkptr)) {
2058 spa_t *spa =
2059 dmu_objset_spa(db->db_objset);
2060 blkptr_t bp = *db->db_blkptr;
2061 dbuf_clear(db);
2062 arc_freed(spa, &bp);
2063 } else {
2064 dbuf_clear(db);
2065 }
2066 } else if (arc_buf_eviction_needed(db->db_buf)) {
2067 dbuf_clear(db);
2068 } else {
2069 mutex_exit(&db->db_mtx);
2070 }
2071 }
2072 } else {
2073 mutex_exit(&db->db_mtx);
2074 }
2075 }
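/*
 * Informal summary of the last-hold outcomes above:
 *
 *	bonus buffer			drop the dnode hold it represents
 *	no ARC buffer			dbuf_evict()
 *	anonymous (released) buffer	drop the ARC ref, then dbuf_evict()
 *	uncacheable or duplicate	dbuf_clear(), notifying the ARC via
 *					arc_freed() when an on-disk copy
 *					exists
 *	otherwise			stay cached; just drop db_mtx
 */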
2076
2077 #pragma weak dmu_buf_refcount = dbuf_refcount
2078 uint64_t
2079 dbuf_refcount(dmu_buf_impl_t *db)
2080 {
2081 return (refcount_count(&db->db_holds));
2082 }
2083
2084 void *
2085 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2086 dmu_buf_evict_func_t *evict_func)
2087 {
2088 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2089 user_data_ptr_ptr, evict_func));
2090 }
2091
2092 void *
2093 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2094 dmu_buf_evict_func_t *evict_func)
2095 {
2096 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2097
2098 db->db_immediate_evict = TRUE;
2099 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2100 user_data_ptr_ptr, evict_func));
2101 }
2102
2103 void *
2104 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2105 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2106 {
2107 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2108 ASSERT(db->db_level == 0);
2109
2110 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2111
2112 mutex_enter(&db->db_mtx);
2113
2114 if (db->db_user_ptr == old_user_ptr) {
2115 db->db_user_ptr = user_ptr;
2116 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2117 db->db_evict_func = evict_func;
2118
2119 dbuf_update_data(db);
2120 } else {
2121 old_user_ptr = db->db_user_ptr;
2122 }
2123
2124 mutex_exit(&db->db_mtx);
2125 return (old_user_ptr);
2126 }
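/*
 * dmu_buf_update_user() behaves like a compare-and-swap on the user
 * pointer: it installs user_ptr only if the current value equals
 * old_user_ptr, and returns the previously observed value otherwise.
 * An illustrative sketch (`up', `up_data', `my_evict_func', and
 * `my_free' are hypothetical):
 *
 *	void *winner;
 *
 *	winner = dmu_buf_update_user(db, NULL, up, &up->up_data,
 *	    my_evict_func);
 *	if (winner != NULL) {
 *		my_free(up);		lost the race; adopt the
 *		up = winner;		existing user data instead
 *	}
 */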
2127
2128 void *
2129 dmu_buf_get_user(dmu_buf_t *db_fake)
2130 {
2131 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2132 ASSERT(!refcount_is_zero(&db->db_holds));
2133
2134 return (db->db_user_ptr);
2135 }
2136
2137 boolean_t
2138 dmu_buf_freeable(dmu_buf_t *dbuf)
2139 {
2140 boolean_t res = B_FALSE;
2141 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2142
2143 if (db->db_blkptr)
2144 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2145 db->db_blkptr, db->db_blkptr->blk_birth);
2146
2147 return (res);
2148 }
2149
2150 blkptr_t *
2151 dmu_buf_get_blkptr(dmu_buf_t *db)
2152 {
2153 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2154 return (dbi->db_blkptr);
2155 }
2156
2157 static void
2158 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2159 {
	/* ASSERT(dmu_tx_is_syncing(tx)); */
2161 ASSERT(MUTEX_HELD(&db->db_mtx));
2162
2163 if (db->db_blkptr != NULL)
2164 return;
2165
2166 if (db->db_blkid == DMU_SPILL_BLKID) {
2167 db->db_blkptr = &dn->dn_phys->dn_spill;
2168 BP_ZERO(db->db_blkptr);
2169 return;
2170 }
2171 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2172 /*
		 * This buffer was allocated at a time when there were
		 * no blkptrs available in the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
2176 */
2177 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2178 ASSERT(db->db_parent == NULL);
2179 db->db_parent = dn->dn_dbuf;
2180 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2181 DBUF_VERIFY(db);
2182 } else {
2183 dmu_buf_impl_t *parent = db->db_parent;
2184 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2185
2186 ASSERT(dn->dn_phys->dn_nlevels > 1);
2187 if (parent == NULL) {
2188 mutex_exit(&db->db_mtx);
2189 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2190 (void) dbuf_hold_impl(dn, db->db_level+1,
2191 db->db_blkid >> epbs, FALSE, db, &parent);
2192 rw_exit(&dn->dn_struct_rwlock);
2193 mutex_enter(&db->db_mtx);
2194 db->db_parent = parent;
2195 }
2196 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2197 (db->db_blkid & ((1ULL << epbs) - 1));
2198 DBUF_VERIFY(db);
2199 }
2200 }
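/*
 * A worked example of the slot arithmetic above: with 16K indirect
 * blocks (dn_indblkshift == 14) and SPA_BLKPTRSHIFT == 7, epbs == 7,
 * so each indirect block maps 128 children.  A level-0 dbuf with
 * blkid 5000 therefore hangs off the level-1 dbuf with blkid
 * 5000 >> 7 == 39, in slot 5000 & 127 == 8 of its blkptr array.
 */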
2201
2202 static void
2203 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2204 {
2205 dmu_buf_impl_t *db = dr->dr_dbuf;
2206 dnode_t *dn;
2207 zio_t *zio;
2208
2209 ASSERT(dmu_tx_is_syncing(tx));
2210
2211 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2212
2213 mutex_enter(&db->db_mtx);
2214
2215 ASSERT(db->db_level > 0);
2216 DBUF_VERIFY(db);
2217
2218 /* Read the block if it hasn't been read yet. */
2219 if (db->db_buf == NULL) {
2220 mutex_exit(&db->db_mtx);
2221 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2222 mutex_enter(&db->db_mtx);
2223 }
2224 ASSERT3U(db->db_state, ==, DB_CACHED);
2225 ASSERT(db->db_buf != NULL);
2226
2227 DB_DNODE_ENTER(db);
2228 dn = DB_DNODE(db);
2229 /* Indirect block size must match what the dnode thinks it is. */
2230 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2231 dbuf_check_blkptr(dn, db);
2232 DB_DNODE_EXIT(db);
2233
2234 /* Provide the pending dirty record to child dbufs */
2235 db->db_data_pending = dr;
2236
2237 mutex_exit(&db->db_mtx);
2238 dbuf_write(dr, db->db_buf, tx);
2239
2240 zio = dr->dr_zio;
2241 mutex_enter(&dr->dt.di.dr_mtx);
2242 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2243 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2244 mutex_exit(&dr->dt.di.dr_mtx);
2245 zio_nowait(zio);
2246 }
2247
2248 static void
2249 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2250 {
2251 arc_buf_t **datap = &dr->dt.dl.dr_data;
2252 dmu_buf_impl_t *db = dr->dr_dbuf;
2253 dnode_t *dn;
2254 objset_t *os;
2255 uint64_t txg = tx->tx_txg;
2256
2257 ASSERT(dmu_tx_is_syncing(tx));
2258
2259 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2260
2261 mutex_enter(&db->db_mtx);
2262 /*
	 * To be synced, we must be dirtied; but we
	 * might have been freed after being dirtied.
2265 */
2266 if (db->db_state == DB_UNCACHED) {
2267 /* This buffer has been freed since it was dirtied */
2268 ASSERT(db->db.db_data == NULL);
2269 } else if (db->db_state == DB_FILL) {
2270 /* This buffer was freed and is now being re-filled */
2271 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2272 } else {
2273 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2274 }
2275 DBUF_VERIFY(db);
2276
2277 DB_DNODE_ENTER(db);
2278 dn = DB_DNODE(db);
2279
2280 if (db->db_blkid == DMU_SPILL_BLKID) {
2281 mutex_enter(&dn->dn_mtx);
2282 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2283 mutex_exit(&dn->dn_mtx);
2284 }
2285
2286 /*
2287 * If this is a bonus buffer, simply copy the bonus data into the
2288 * dnode. It will be written out when the dnode is synced (and it
2289 * will be synced, since it must have been dirty for dbuf_sync to
2290 * be called).
2291 */
2292 if (db->db_blkid == DMU_BONUS_BLKID) {
2293 dbuf_dirty_record_t **drp;
2294
2295 ASSERT(*datap != NULL);
2296 ASSERT0(db->db_level);
2297 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2298 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2299 DB_DNODE_EXIT(db);
2300
2301 if (*datap != db->db.db_data) {
2302 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2303 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2304 }
2305 db->db_data_pending = NULL;
2306 drp = &db->db_last_dirty;
2307 while (*drp != dr)
2308 drp = &(*drp)->dr_next;
2309 ASSERT(dr->dr_next == NULL);
2310 ASSERT(dr->dr_dbuf == db);
2311 *drp = dr->dr_next;
2312 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2313 ASSERT(db->db_dirtycnt > 0);
2314 db->db_dirtycnt -= 1;
2315 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2316 return;
2317 }
2318
2319 os = dn->dn_objset;
2320
2321 /*
2322 * This function may have dropped the db_mtx lock allowing a dmu_sync
2323 * operation to sneak in. As a result, we need to ensure that we
2324 * don't check the dr_override_state until we have returned from
2325 * dbuf_check_blkptr.
2326 */
2327 dbuf_check_blkptr(dn, db);
2328
2329 /*
2330 * If this buffer is in the middle of an immediate write,
2331 * wait for the synchronous IO to complete.
2332 */
2333 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2334 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2335 cv_wait(&db->db_changed, &db->db_mtx);
2336 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2337 }
2338
2339 if (db->db_state != DB_NOFILL &&
2340 dn->dn_object != DMU_META_DNODE_OBJECT &&
2341 refcount_count(&db->db_holds) > 1 &&
2342 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2343 *datap == db->db_buf) {
2344 /*
2345 * If this buffer is currently "in use" (i.e., there
2346 * are active holds and db_data still references it),
2347 * then make a copy before we start the write so that
2348 * any modifications from the open txg will not leak
2349 * into this write.
2350 *
		 * NOTE: this copy does not need to be made for
		 * objects that are only modified in the syncing
		 * context (e.g. the meta-dnode's blocks).
2354 */
2355 int blksz = arc_buf_size(*datap);
2356 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2357 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2358 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2359 }
2360 db->db_data_pending = dr;
2361
2362 mutex_exit(&db->db_mtx);
2363
2364 dbuf_write(dr, *datap, tx);
2365
2366 ASSERT(!list_link_active(&dr->dr_dirty_node));
2367 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2368 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2369 DB_DNODE_EXIT(db);
2370 } else {
2371 /*
2372 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write, the IO could
		 * even complete before zio_nowait() returns, so we must
		 * DB_DNODE_EXIT() first in case zio_nowait() ends up
		 * invalidating the dbuf.
2377 */
2378 DB_DNODE_EXIT(db);
2379 zio_nowait(dr->dr_zio);
2380 }
2381 }
2382
2383 void
2384 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2385 {
2386 dbuf_dirty_record_t *dr;
2387
	while ((dr = list_head(list)) != NULL) {
2389 if (dr->dr_zio != NULL) {
2390 /*
2391 * If we find an already initialized zio then we
2392 * are processing the meta-dnode, and we have finished.
2393 * The dbufs for all dnodes are put back on the list
2394 * during processing, so that we can zio_wait()
2395 * these IOs after initiating all child IOs.
2396 */
2397 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2398 DMU_META_DNODE_OBJECT);
2399 break;
2400 }
2401 list_remove(list, dr);
2402 if (dr->dr_dbuf->db_level > 0)
2403 dbuf_sync_indirect(dr, tx);
2404 else
2405 dbuf_sync_leaf(dr, tx);
2406 }
2407 }
2408
2409 /* ARGSUSED */
2410 static void
2411 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2412 {
2413 dmu_buf_impl_t *db = vdb;
2414 dnode_t *dn;
2415 blkptr_t *bp = zio->io_bp;
2416 blkptr_t *bp_orig = &zio->io_bp_orig;
2417 spa_t *spa = zio->io_spa;
2418 int64_t delta;
2419 uint64_t fill = 0;
2420 int i;
2421
2422 ASSERT3P(db->db_blkptr, ==, bp);
2423
2424 DB_DNODE_ENTER(db);
2425 dn = DB_DNODE(db);
2426 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2427 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2428 zio->io_prev_space_delta = delta;
2429
2430 if (bp->blk_birth != 0) {
2431 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2432 BP_GET_TYPE(bp) == dn->dn_type) ||
2433 (db->db_blkid == DMU_SPILL_BLKID &&
2434 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2435 BP_IS_EMBEDDED(bp));
2436 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2437 }
2438
2439 mutex_enter(&db->db_mtx);
2440
2441 #ifdef ZFS_DEBUG
2442 if (db->db_blkid == DMU_SPILL_BLKID) {
2443 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2444 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2445 db->db_blkptr == &dn->dn_phys->dn_spill);
2446 }
2447 #endif
2448
2449 if (db->db_level == 0) {
2450 mutex_enter(&dn->dn_mtx);
2451 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2452 db->db_blkid != DMU_SPILL_BLKID)
2453 dn->dn_phys->dn_maxblkid = db->db_blkid;
2454 mutex_exit(&dn->dn_mtx);
2455
2456 if (dn->dn_type == DMU_OT_DNODE) {
2457 dnode_phys_t *dnp = db->db.db_data;
2458 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2459 i--, dnp++) {
2460 if (dnp->dn_type != DMU_OT_NONE)
2461 fill++;
2462 }
		} else {
			fill = BP_IS_HOLE(bp) ? 0 : 1;
		}
2470 } else {
2471 blkptr_t *ibp = db->db.db_data;
2472 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2473 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2474 if (BP_IS_HOLE(ibp))
2475 continue;
2476 fill += BP_GET_FILL(ibp);
2477 }
2478 }
2479 DB_DNODE_EXIT(db);
2480
2481 if (!BP_IS_EMBEDDED(bp))
2482 bp->blk_fill = fill;
2483
2484 mutex_exit(&db->db_mtx);
2485 }
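/*
 * Example of the fill accounting above: a 16K block of dnodes holds
 * 16384 >> DNODE_SHIFT == 32 slots, and each allocated slot
 * (dn_type != DMU_OT_NONE) contributes one to fill.  For an indirect
 * block, fill is instead the sum of BP_GET_FILL() over its non-hole
 * children, so blk_fill counts the leaf blocks beneath the subtree.
 */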
2486
2487 /*
2488 * The SPA will call this callback several times for each zio - once
2489 * for every physical child i/o (zio->io_phys_children times). This
2490 * allows the DMU to monitor the progress of each logical i/o. For example,
2491 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2492 * block. There may be a long delay before all copies/fragments are completed,
2493 * so this callback allows us to retire dirty space gradually, as the physical
2494 * i/os complete.
2495 */
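/*
 * For example, if a dirty record accounted for 120K of dirty space and
 * the logical write fans out to io_phys_children == 3 physical child
 * i/os (say, copies=3), each invocation below retires 120K / 3 == 40K.
 * Any integer-division remainder is mopped up by dsl_pool_sync()'s
 * final call to dsl_pool_undirty_space().
 */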
2496 /* ARGSUSED */
2497 static void
2498 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2499 {
2500 dmu_buf_impl_t *db = arg;
2501 objset_t *os = db->db_objset;
2502 dsl_pool_t *dp = dmu_objset_pool(os);
2503 dbuf_dirty_record_t *dr;
2504 int delta = 0;
2505
2506 dr = db->db_data_pending;
2507 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2508
2509 /*
2510 * The callback will be called io_phys_children times. Retire one
2511 * portion of our dirty space each time we are called. Any rounding
2512 * error will be cleaned up by dsl_pool_sync()'s call to
2513 * dsl_pool_undirty_space().
2514 */
2515 delta = dr->dr_accounted / zio->io_phys_children;
2516 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2517 }
2518
2519 /* ARGSUSED */
2520 static void
2521 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2522 {
2523 dmu_buf_impl_t *db = vdb;
2524 blkptr_t *bp_orig = &zio->io_bp_orig;
2525 blkptr_t *bp = db->db_blkptr;
2526 objset_t *os = db->db_objset;
2527 dmu_tx_t *tx = os->os_synctx;
2528 dbuf_dirty_record_t **drp, *dr;
2529
2530 ASSERT0(zio->io_error);
2531 ASSERT(db->db_blkptr == bp);
2532
2533 /*
2534 * For nopwrites and rewrites we ensure that the bp matches our
2535 * original and bypass all the accounting.
2536 */
2537 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2538 ASSERT(BP_EQUAL(bp, bp_orig));
2539 } else {
2540 dsl_dataset_t *ds = os->os_dsl_dataset;
2541 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2542 dsl_dataset_block_born(ds, bp, tx);
2543 }
2544
2545 mutex_enter(&db->db_mtx);
2546
2547 DBUF_VERIFY(db);
2548
2549 drp = &db->db_last_dirty;
2550 while ((dr = *drp) != db->db_data_pending)
2551 drp = &dr->dr_next;
2552 ASSERT(!list_link_active(&dr->dr_dirty_node));
2553 ASSERT(dr->dr_dbuf == db);
2554 ASSERT(dr->dr_next == NULL);
2555 *drp = dr->dr_next;
2556
2557 #ifdef ZFS_DEBUG
2558 if (db->db_blkid == DMU_SPILL_BLKID) {
2559 dnode_t *dn;
2560
2561 DB_DNODE_ENTER(db);
2562 dn = DB_DNODE(db);
2563 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2564 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2565 db->db_blkptr == &dn->dn_phys->dn_spill);
2566 DB_DNODE_EXIT(db);
2567 }
2568 #endif
2569
2570 if (db->db_level == 0) {
2571 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2572 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2573 if (db->db_state != DB_NOFILL) {
2574 if (dr->dt.dl.dr_data != db->db_buf)
2575 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2576 db));
2577 else if (!arc_released(db->db_buf))
2578 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2579 }
2580 } else {
2581 dnode_t *dn;
2582
2583 DB_DNODE_ENTER(db);
2584 dn = DB_DNODE(db);
2585 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2586 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2587 if (!BP_IS_HOLE(db->db_blkptr)) {
2588 int epbs =
2589 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2590 ASSERT3U(db->db_blkid, <=,
2591 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2592 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2593 db->db.db_size);
2594 if (!arc_released(db->db_buf))
2595 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2596 }
2597 DB_DNODE_EXIT(db);
2598 mutex_destroy(&dr->dt.di.dr_mtx);
2599 list_destroy(&dr->dt.di.dr_children);
2600 }
2601 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2602
2603 cv_broadcast(&db->db_changed);
2604 ASSERT(db->db_dirtycnt > 0);
2605 db->db_dirtycnt -= 1;
2606 db->db_data_pending = NULL;
2607 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2608 }
2609
2610 static void
2611 dbuf_write_nofill_ready(zio_t *zio)
2612 {
2613 dbuf_write_ready(zio, NULL, zio->io_private);
2614 }
2615
2616 static void
2617 dbuf_write_nofill_done(zio_t *zio)
2618 {
2619 dbuf_write_done(zio, NULL, zio->io_private);
2620 }
2621
2622 static void
2623 dbuf_write_override_ready(zio_t *zio)
2624 {
2625 dbuf_dirty_record_t *dr = zio->io_private;
2626 dmu_buf_impl_t *db = dr->dr_dbuf;
2627
2628 dbuf_write_ready(zio, NULL, db);
2629 }
2630
2631 static void
2632 dbuf_write_override_done(zio_t *zio)
2633 {
2634 dbuf_dirty_record_t *dr = zio->io_private;
2635 dmu_buf_impl_t *db = dr->dr_dbuf;
2636 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2637
2638 mutex_enter(&db->db_mtx);
2639 if (!BP_EQUAL(zio->io_bp, obp)) {
2640 if (!BP_IS_HOLE(obp))
2641 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2642 arc_release(dr->dt.dl.dr_data, db);
2643 }
2644 mutex_exit(&db->db_mtx);
2645
2646 dbuf_write_done(zio, NULL, db);
2647 }
2648
2649 /* Issue I/O to commit a dirty buffer to disk. */
2650 static void
2651 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2652 {
2653 dmu_buf_impl_t *db = dr->dr_dbuf;
2654 dnode_t *dn;
2655 objset_t *os;
2656 dmu_buf_impl_t *parent = db->db_parent;
2657 uint64_t txg = tx->tx_txg;
2658 zbookmark_phys_t zb;
2659 zio_prop_t zp;
2660 zio_t *zio;
2661 int wp_flag = 0;
2662
2663 DB_DNODE_ENTER(db);
2664 dn = DB_DNODE(db);
2665 os = dn->dn_objset;
2666
2667 if (db->db_state != DB_NOFILL) {
2668 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2669 /*
2670 * Private object buffers are released here rather
2671 * than in dbuf_dirty() since they are only modified
2672 * in the syncing context and we don't want the
2673 * overhead of making multiple copies of the data.
2674 */
2675 if (BP_IS_HOLE(db->db_blkptr)) {
2676 arc_buf_thaw(data);
2677 } else {
2678 dbuf_release_bp(db);
2679 }
2680 }
2681 }
2682
2683 if (parent != dn->dn_dbuf) {
2684 /* Our parent is an indirect block. */
2685 /* We have a dirty parent that has been scheduled for write. */
2686 ASSERT(parent && parent->db_data_pending);
2687 /* Our parent's buffer is one level closer to the dnode. */
2688 ASSERT(db->db_level == parent->db_level-1);
2689 /*
2690 * We're about to modify our parent's db_data by modifying
2691 * our block pointer, so the parent must be released.
2692 */
2693 ASSERT(arc_released(parent->db_buf));
2694 zio = parent->db_data_pending->dr_zio;
2695 } else {
2696 /* Our parent is the dnode itself. */
2697 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2698 db->db_blkid != DMU_SPILL_BLKID) ||
2699 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2700 if (db->db_blkid != DMU_SPILL_BLKID)
2701 ASSERT3P(db->db_blkptr, ==,
2702 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2703 zio = dn->dn_zio;
2704 }
2705
2706 ASSERT(db->db_level == 0 || data == db->db_buf);
2707 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2708 ASSERT(zio);
2709
2710 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2711 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2712 db->db.db_object, db->db_level, db->db_blkid);
2713
2714 if (db->db_blkid == DMU_SPILL_BLKID)
2715 wp_flag = WP_SPILL;
2716 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2717 WP_SET_SPECIALCLASS(wp_flag, dr->dr_usesc);
2718
2719 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2720 DB_DNODE_EXIT(db);
2721
2722 if (db->db_level == 0 &&
2723 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2724 /*
2725 * The BP for this block has been provided by open context
2726 * (by dmu_sync() or dmu_buf_write_embedded()).
2727 */
2728 void *contents = (data != NULL) ? data->b_data : NULL;
2729
2730 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2731 db->db_blkptr, contents, db->db.db_size, &zp,
2732 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2733 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2734 mutex_enter(&db->db_mtx);
2735 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2736 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2737 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2738 mutex_exit(&db->db_mtx);
2739 } else if (db->db_state == DB_NOFILL) {
2740 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2741 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2742 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2743 db->db_blkptr, NULL, db->db.db_size, &zp,
2744 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2745 ZIO_PRIORITY_ASYNC_WRITE,
2746 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2747 } else {
2748 ASSERT(arc_released(data));
2749 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2750 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2751 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2752 dbuf_write_physdone, dbuf_write_done, db,
2753 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2754 }
2755 }
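/*
 * Informal summary of the three dispatch paths above:
 *
 *	DR_OVERRIDDEN (level 0)	zio_write() followed by
 *				zio_write_override(): the BP was already
 *				produced in open context by dmu_sync() or
 *				dmu_buf_write_embedded().
 *	DB_NOFILL		zio_write() with no data and
 *				ZIO_FLAG_NODATA.
 *	everything else		arc_write() of the ARC buffer, with the
 *				ready/physdone/done callbacks above
 *				maintaining fill counts, retiring dirty
 *				space, and tearing down the dirty record.
 */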