/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive.  A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	dmu_buf_impl_t *db;
	dmu_buf_impl_t key;
	avl_index_t where;

	key.db_level = level;
	key.db_blkid = blkid;
	key.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	db = avl_find(&dn->dn_dbufs, &key, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if ((db->db_level != level) || (db->db_blkid != blkid))
			break;

		mutex_enter(&db->db_mtx);
		if (db->db_state != DB_EVICTING) {
			mutex_exit(&dn->dn_dbufs_mtx);
			return (db);
		}
		mutex_exit(&db->db_mtx);
	}

	mutex_exit(&dn->dn_dbufs_mtx);
	return (NULL);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
}

void
dbuf_fini(void)
{
	kmem_cache_destroy(dbuf_cache);
}
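/*
 * Usage sketch (illustrative only, not part of the driver): dbuf_find()
 * above returns with db_mtx held when it finds a dbuf, so a caller that only
 * wants to test for presence must drop the mutex itself.  This mirrors the
 * pattern used by dbuf_prefetch() later in this file; "level" and "blkid"
 * here are assumed to come from the caller's context.
 *
 *	dmu_buf_impl_t *db = dbuf_find(dn, level, blkid);
 *	if (db != NULL) {
 *		// already cached (or being read/filled)
 *		mutex_exit(&db->db_mtx);
 *	}
 */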
/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}
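/*
 * Worked example for dbuf_whichblock() above (illustrative only): for an
 * object with a 128K data block size, dn_datablkshift would be 17, so
 * offset 0x30000 (192K) maps to block 0x30000 >> 17 == 1.  When
 * dn_datablkshift is zero the object consists of a single block, and the
 * assertion checks that the offset lies inside it.
 */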
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
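/*
 * Flag flow sketch for dbuf_read_impl() above (illustrative only; the
 * caller outline is an assumption modeled on dbuf_read() below).  The flags
 * word is passed by reference so the callee can report whether the data was
 * satisfied without issuing a disk read:
 *
 *	uint32_t flags = DB_RF_CANFAIL;
 *	dbuf_read_impl(db, zio, &flags);
 *	if (flags & DB_RF_CACHED) {
 *		// hole, bonus buffer, or ARC hit -- no disk I/O was needed;
 *		// dbuf_read() passes this to dmu_zfetch() as a cache hit
 *	}
 */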
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}
/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
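/*
 * Decision sketch for dbuf_fix_old_data() above (illustrative only; the
 * counts are hypothetical).  With holds == 3 and db_dirtycnt == 1 the
 * buffer is still referenced in open context, so the unsynced dirty record
 * gets its own copy of the data:
 *
 *	dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 *	bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 *
 * If instead holds == db_dirtycnt (only the dirty records hold the buffer),
 * dbuf_set_data(db, NULL) simply detaches db_data and the dirty record
 * keeps the existing ARC buffer.
 */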
/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next, db_search;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}
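/*
 * Usage note for dbuf_free_range() above (illustrative): a single block can
 * be freed by passing the same block id as both ends of the range.
 * dbuf_rm_spill() later in this file does exactly this for the spill block:
 *
 *	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 */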
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}
dbuf_dirty_record_t *
dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx, boolean_t usesc)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}

		/*
		 * Special class usage of dirty dbuf could be changed,
		 * update the dirty entry.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	dr->dr_usesc = usesc;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty_sc(dn, tx, usesc);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, usesc, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty_sc(parent, tx, usesc);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}

		/*
		 * Special class usage of dirty dbuf could be changed,
		 * update the dirty entry.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty_sc(dn, tx, usesc);
	DB_DNODE_EXIT(db);
	return (dr);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	spa_t *spa;

	ASSERT(db->db_objset != NULL);
	spa = db->db_objset->os_spa;

	return (dbuf_dirty_sc(db, tx, spa->spa_usesc));
}
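/*
 * Note on the dirty record chain (illustrative; the txg numbers are a
 * hypothetical scenario).  db_last_dirty is kept sorted from newest to
 * oldest txg, so the walk in dbuf_dirty_sc() above stops at the first
 * record whose txg is <= the current one.  With open txg 104 and unsynced
 * records for txgs 103 and 102, the chain after linking the new record is
 *
 *	db->db_last_dirty -> dr(104) -> dr(103) -> dr(102) -> NULL
 *
 * one record per txg that has not yet synced, and db_dirtycnt is at most 3
 * (the ASSERT3U above).
 */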
/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync().  This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_will_dirty_sc(db_fake, tx, B_TRUE);
}

void
dmu_buf_will_dirty_sc(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t usesc)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty_sc(db, tx, usesc);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
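/*
 * Usage sketch for dbuf_assign_arcbuf() above (illustrative only; the
 * caller, the "src" pointer, and the surrounding error handling are
 * assumptions).  A writer can fill a loaned ARC buffer and hand it to the
 * dbuf, avoiding a copy when the dbuf is not otherwise referenced:
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, db->db.db_size);
 *	bcopy(src, abuf->b_data, db->db.db_size);
 *	dbuf_assign_arcbuf(db, abuf, tx);
 *
 * If other holders still reference the cached data, the function falls back
 * to bcopy()ing into the existing buffer and releases the now-unneeded
 * loaned buffer (the xuio_stat_wbuf_copied() path above).
 */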
/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold
		 * corresponding to the removed dbuf is no longer discounted
		 * in dnode_move(), so the dnode cannot be moved until after
		 * we release the hold.  The membar_producer() ensures
		 * visibility of the decremented value in dnode_move(), since
		 * DB_DNODE_EXIT doesn't actually release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;
	avl_index_t where;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed into the dnode's dbuf tree */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	mutex_enter(&dn->dn_dbufs_mtx);
	mutex_enter(&db->db_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = avl_find(&dn->dn_dbufs, db, &where))) {
		/* someone else inserted it first */
		mutex_exit(&db->db_mtx);
		kmem_cache_free(dbuf_cache, db);
		mutex_enter(&odb->db_mtx);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_insert(&dn->dn_dbufs, db, where);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
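/*
 * Sizing note for dbuf_create() above (worked example, illustrative): the
 * bonus dbuf's size is DN_MAX_BONUSLEN minus the space taken by any extra
 * block pointers in the dnode.  For a hypothetical dnode with
 * dn_nblkptr == 3 that is DN_MAX_BONUSLEN - 2 * sizeof (blkptr_t); with the
 * usual single block pointer it is simply DN_MAX_BONUSLEN.
 */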
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are still
	 * referencing it from db_data, we need to make a copy of it in
	 * case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}
NULL : db); 1894 } 1895 1896 dmu_buf_impl_t * 1897 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1898 { 1899 dmu_buf_impl_t *db; 1900 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1901 return (err ? NULL : db); 1902 } 1903 1904 void 1905 dbuf_create_bonus(dnode_t *dn) 1906 { 1907 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1908 1909 ASSERT(dn->dn_bonus == NULL); 1910 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1911 } 1912 1913 int 1914 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 1915 { 1916 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1917 dnode_t *dn; 1918 1919 if (db->db_blkid != DMU_SPILL_BLKID) 1920 return (SET_ERROR(ENOTSUP)); 1921 if (blksz == 0) 1922 blksz = SPA_MINBLOCKSIZE; 1923 if (blksz > SPA_MAXBLOCKSIZE) 1924 blksz = SPA_MAXBLOCKSIZE; 1925 else 1926 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 1927 1928 DB_DNODE_ENTER(db); 1929 dn = DB_DNODE(db); 1930 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 1931 dbuf_new_size(db, blksz, tx); 1932 rw_exit(&dn->dn_struct_rwlock); 1933 DB_DNODE_EXIT(db); 1934 1935 return (0); 1936 } 1937 1938 void 1939 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 1940 { 1941 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 1942 } 1943 1944 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1945 void 1946 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1947 { 1948 int64_t holds = refcount_add(&db->db_holds, tag); 1949 ASSERT(holds > 1); 1950 } 1951 1952 /* 1953 * If you call dbuf_rele() you had better not be referencing the dnode handle 1954 * unless you have some other direct or indirect hold on the dnode. (An indirect 1955 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 1956 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 1957 * dnode's parent dbuf evicting its dnode handles. 1958 */ 1959 void 1960 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1961 { 1962 mutex_enter(&db->db_mtx); 1963 dbuf_rele_and_unlock(db, tag); 1964 } 1965 1966 void 1967 dmu_buf_rele(dmu_buf_t *db, void *tag) 1968 { 1969 dbuf_rele((dmu_buf_impl_t *)db, tag); 1970 } 1971 1972 /* 1973 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1974 * db_dirtycnt and db_holds to be updated atomically. 1975 */ 1976 void 1977 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1978 { 1979 int64_t holds; 1980 1981 ASSERT(MUTEX_HELD(&db->db_mtx)); 1982 DBUF_VERIFY(db); 1983 1984 /* 1985 * Remove the reference to the dbuf before removing its hold on the 1986 * dnode so we can guarantee in dnode_move() that a referenced bonus 1987 * buffer has a corresponding dnode hold. 1988 */ 1989 holds = refcount_remove(&db->db_holds, tag); 1990 ASSERT(holds >= 0); 1991 1992 /* 1993 * We can't freeze indirects if there is a possibility that they 1994 * may be modified in the current syncing context. 1995 */ 1996 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1997 arc_buf_freeze(db->db_buf); 1998 1999 if (holds == db->db_dirtycnt && 2000 db->db_level == 0 && db->db_immediate_evict) 2001 dbuf_evict_user(db); 2002 2003 if (holds == 0) { 2004 if (db->db_blkid == DMU_BONUS_BLKID) { 2005 mutex_exit(&db->db_mtx); 2006 2007 /* 2008 * If the dnode moves here, we cannot cross this barrier 2009 * until the move completes. 2010 */ 2011 DB_DNODE_ENTER(db); 2012 atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); 2013 DB_DNODE_EXIT(db); 2014 /* 2015 * The bonus buffer's dnode hold is no longer discounted 2016 * in dnode_move(). 
The dnode cannot move until after 2017 * the dnode_rele(). 2018 */ 2019 dnode_rele(DB_DNODE(db), db); 2020 } else if (db->db_buf == NULL) { 2021 /* 2022 * This is a special case: we never associated this 2023 * dbuf with any data allocated from the ARC. 2024 */ 2025 ASSERT(db->db_state == DB_UNCACHED || 2026 db->db_state == DB_NOFILL); 2027 dbuf_evict(db); 2028 } else if (arc_released(db->db_buf)) { 2029 arc_buf_t *buf = db->db_buf; 2030 /* 2031 * This dbuf has anonymous data associated with it. 2032 */ 2033 dbuf_set_data(db, NULL); 2034 VERIFY(arc_buf_remove_ref(buf, db)); 2035 dbuf_evict(db); 2036 } else { 2037 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2038 2039 /* 2040 * A dbuf will be eligible for eviction if either the 2041 * 'primarycache' property is set or a duplicate 2042 * copy of this buffer is already cached in the arc. 2043 * 2044 * In the case of the 'primarycache' a buffer 2045 * is considered for eviction if it matches the 2046 * criteria set in the property. 2047 * 2048 * To decide if our buffer is considered a 2049 * duplicate, we must call into the arc to determine 2050 * if multiple buffers are referencing the same 2051 * block on-disk. If so, then we simply evict 2052 * ourselves. 2053 */ 2054 if (!DBUF_IS_CACHEABLE(db)) { 2055 if (db->db_blkptr != NULL && 2056 !BP_IS_HOLE(db->db_blkptr) && 2057 !BP_IS_EMBEDDED(db->db_blkptr)) { 2058 spa_t *spa = 2059 dmu_objset_spa(db->db_objset); 2060 blkptr_t bp = *db->db_blkptr; 2061 dbuf_clear(db); 2062 arc_freed(spa, &bp); 2063 } else { 2064 dbuf_clear(db); 2065 } 2066 } else if (arc_buf_eviction_needed(db->db_buf)) { 2067 dbuf_clear(db); 2068 } else { 2069 mutex_exit(&db->db_mtx); 2070 } 2071 } 2072 } else { 2073 mutex_exit(&db->db_mtx); 2074 } 2075 } 2076 2077 #pragma weak dmu_buf_refcount = dbuf_refcount 2078 uint64_t 2079 dbuf_refcount(dmu_buf_impl_t *db) 2080 { 2081 return (refcount_count(&db->db_holds)); 2082 } 2083 2084 void * 2085 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2086 dmu_buf_evict_func_t *evict_func) 2087 { 2088 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2089 user_data_ptr_ptr, evict_func)); 2090 } 2091 2092 void * 2093 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2094 dmu_buf_evict_func_t *evict_func) 2095 { 2096 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2097 2098 db->db_immediate_evict = TRUE; 2099 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2100 user_data_ptr_ptr, evict_func)); 2101 } 2102 2103 void * 2104 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2105 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2106 { 2107 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2108 ASSERT(db->db_level == 0); 2109 2110 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2111 2112 mutex_enter(&db->db_mtx); 2113 2114 if (db->db_user_ptr == old_user_ptr) { 2115 db->db_user_ptr = user_ptr; 2116 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2117 db->db_evict_func = evict_func; 2118 2119 dbuf_update_data(db); 2120 } else { 2121 old_user_ptr = db->db_user_ptr; 2122 } 2123 2124 mutex_exit(&db->db_mtx); 2125 return (old_user_ptr); 2126 } 2127 2128 void * 2129 dmu_buf_get_user(dmu_buf_t *db_fake) 2130 { 2131 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2132 ASSERT(!refcount_is_zero(&db->db_holds)); 2133 2134 return (db->db_user_ptr); 2135 } 2136 2137 boolean_t 2138 dmu_buf_freeable(dmu_buf_t *dbuf) 2139 { 2140 boolean_t res = B_FALSE; 2141 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2142 2143 
if (db->db_blkptr)
2144 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2145 db->db_blkptr, db->db_blkptr->blk_birth);
2146
2147 return (res);
2148 }
2149
2150 blkptr_t *
2151 dmu_buf_get_blkptr(dmu_buf_t *db)
2152 {
2153 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2154 return (dbi->db_blkptr);
2155 }
2156
2157 static void
2158 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2159 {
2160 /* ASSERT(dmu_tx_is_syncing(tx) */
2161 ASSERT(MUTEX_HELD(&db->db_mtx));
2162
2163 if (db->db_blkptr != NULL)
2164 return;
2165
2166 if (db->db_blkid == DMU_SPILL_BLKID) {
2167 db->db_blkptr = &dn->dn_phys->dn_spill;
2168 BP_ZERO(db->db_blkptr);
2169 return;
2170 }
2171 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2172 /*
2173 * This buffer was allocated at a time when there were
2174 * no available blkptrs from the dnode, or it was
2175 * inappropriate to hook it in (i.e., nlevels mismatch).
2176 */
2177 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2178 ASSERT(db->db_parent == NULL);
2179 db->db_parent = dn->dn_dbuf;
2180 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2181 DBUF_VERIFY(db);
2182 } else {
2183 dmu_buf_impl_t *parent = db->db_parent;
2184 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2185
2186 ASSERT(dn->dn_phys->dn_nlevels > 1);
2187 if (parent == NULL) {
2188 mutex_exit(&db->db_mtx);
2189 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2190 (void) dbuf_hold_impl(dn, db->db_level+1,
2191 db->db_blkid >> epbs, FALSE, db, &parent);
2192 rw_exit(&dn->dn_struct_rwlock);
2193 mutex_enter(&db->db_mtx);
2194 db->db_parent = parent;
2195 }
2196 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2197 (db->db_blkid & ((1ULL << epbs) - 1));
2198 DBUF_VERIFY(db);
2199 }
2200 }
2201
2202 static void
2203 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2204 {
2205 dmu_buf_impl_t *db = dr->dr_dbuf;
2206 dnode_t *dn;
2207 zio_t *zio;
2208
2209 ASSERT(dmu_tx_is_syncing(tx));
2210
2211 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2212
2213 mutex_enter(&db->db_mtx);
2214
2215 ASSERT(db->db_level > 0);
2216 DBUF_VERIFY(db);
2217
2218 /* Read the block if it hasn't been read yet. */
2219 if (db->db_buf == NULL) {
2220 mutex_exit(&db->db_mtx);
2221 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2222 mutex_enter(&db->db_mtx);
2223 }
2224 ASSERT3U(db->db_state, ==, DB_CACHED);
2225 ASSERT(db->db_buf != NULL);
2226
2227 DB_DNODE_ENTER(db);
2228 dn = DB_DNODE(db);
2229 /* Indirect block size must match what the dnode thinks it is. */
2230 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2231 dbuf_check_blkptr(dn, db);
2232 DB_DNODE_EXIT(db);
2233
2234 /* Provide the pending dirty record to child dbufs */
2235 db->db_data_pending = dr;
2236
2237 mutex_exit(&db->db_mtx);
2238 dbuf_write(dr, db->db_buf, tx);
2239
2240 zio = dr->dr_zio;
2241 mutex_enter(&dr->dt.di.dr_mtx);
2242 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2243 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2244 mutex_exit(&dr->dt.di.dr_mtx);
2245 zio_nowait(zio);
2246 }
2247
2248 static void
2249 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2250 {
2251 arc_buf_t **datap = &dr->dt.dl.dr_data;
2252 dmu_buf_impl_t *db = dr->dr_dbuf;
2253 dnode_t *dn;
2254 objset_t *os;
2255 uint64_t txg = tx->tx_txg;
2256
2257 ASSERT(dmu_tx_is_syncing(tx));
2258
2259 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2260
2261 mutex_enter(&db->db_mtx);
2262 /*
2263 * To be synced, we must be dirtied. But we
2264 * might have been freed after the dirty.
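 * A free in the same txg can leave the buffer DB_UNCACHED, or in DB_FILL
 * if it is already being re-filled; both cases are tolerated below.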
2265 */
2266 if (db->db_state == DB_UNCACHED) {
2267 /* This buffer has been freed since it was dirtied */
2268 ASSERT(db->db.db_data == NULL);
2269 } else if (db->db_state == DB_FILL) {
2270 /* This buffer was freed and is now being re-filled */
2271 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2272 } else {
2273 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2274 }
2275 DBUF_VERIFY(db);
2276
2277 DB_DNODE_ENTER(db);
2278 dn = DB_DNODE(db);
2279
2280 if (db->db_blkid == DMU_SPILL_BLKID) {
2281 mutex_enter(&dn->dn_mtx);
2282 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2283 mutex_exit(&dn->dn_mtx);
2284 }
2285
2286 /*
2287 * If this is a bonus buffer, simply copy the bonus data into the
2288 * dnode. It will be written out when the dnode is synced (and it
2289 * will be synced, since it must have been dirty for dbuf_sync to
2290 * be called).
2291 */
2292 if (db->db_blkid == DMU_BONUS_BLKID) {
2293 dbuf_dirty_record_t **drp;
2294
2295 ASSERT(*datap != NULL);
2296 ASSERT0(db->db_level);
2297 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2298 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2299 DB_DNODE_EXIT(db);
2300
2301 if (*datap != db->db.db_data) {
2302 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2303 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2304 }
2305 db->db_data_pending = NULL;
2306 drp = &db->db_last_dirty;
2307 while (*drp != dr)
2308 drp = &(*drp)->dr_next;
2309 ASSERT(dr->dr_next == NULL);
2310 ASSERT(dr->dr_dbuf == db);
2311 *drp = dr->dr_next;
2312 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2313 ASSERT(db->db_dirtycnt > 0);
2314 db->db_dirtycnt -= 1;
2315 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2316 return;
2317 }
2318
2319 os = dn->dn_objset;
2320
2321 /*
2322 * This function may have dropped the db_mtx lock allowing a dmu_sync
2323 * operation to sneak in. As a result, we need to ensure that we
2324 * don't check the dr_override_state until we have returned from
2325 * dbuf_check_blkptr.
2326 */
2327 dbuf_check_blkptr(dn, db);
2328
2329 /*
2330 * If this buffer is in the middle of an immediate write,
2331 * wait for the synchronous IO to complete.
2332 */
2333 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2334 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2335 cv_wait(&db->db_changed, &db->db_mtx);
2336 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2337 }
2338
2339 if (db->db_state != DB_NOFILL &&
2340 dn->dn_object != DMU_META_DNODE_OBJECT &&
2341 refcount_count(&db->db_holds) > 1 &&
2342 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2343 *datap == db->db_buf) {
2344 /*
2345 * If this buffer is currently "in use" (i.e., there
2346 * are active holds and db_data still references it),
2347 * then make a copy before we start the write so that
2348 * any modifications from the open txg will not leak
2349 * into this write.
2350 *
2351 * NOTE: this copy does not need to be made for
2352 * objects only modified in the syncing context (e.g.
2353 * DMU_OT_DNODE blocks).
2354 */ 2355 int blksz = arc_buf_size(*datap); 2356 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2357 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2358 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2359 } 2360 db->db_data_pending = dr; 2361 2362 mutex_exit(&db->db_mtx); 2363 2364 dbuf_write(dr, *datap, tx); 2365 2366 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2367 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2368 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2369 DB_DNODE_EXIT(db); 2370 } else { 2371 /* 2372 * Although zio_nowait() does not "wait for an IO", it does 2373 * initiate the IO. If this is an empty write it seems plausible 2374 * that the IO could actually be completed before the nowait 2375 * returns. We need to DB_DNODE_EXIT() first in case 2376 * zio_nowait() invalidates the dbuf. 2377 */ 2378 DB_DNODE_EXIT(db); 2379 zio_nowait(dr->dr_zio); 2380 } 2381 } 2382 2383 void 2384 dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2385 { 2386 dbuf_dirty_record_t *dr; 2387 2388 while (dr = list_head(list)) { 2389 if (dr->dr_zio != NULL) { 2390 /* 2391 * If we find an already initialized zio then we 2392 * are processing the meta-dnode, and we have finished. 2393 * The dbufs for all dnodes are put back on the list 2394 * during processing, so that we can zio_wait() 2395 * these IOs after initiating all child IOs. 2396 */ 2397 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2398 DMU_META_DNODE_OBJECT); 2399 break; 2400 } 2401 list_remove(list, dr); 2402 if (dr->dr_dbuf->db_level > 0) 2403 dbuf_sync_indirect(dr, tx); 2404 else 2405 dbuf_sync_leaf(dr, tx); 2406 } 2407 } 2408 2409 /* ARGSUSED */ 2410 static void 2411 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2412 { 2413 dmu_buf_impl_t *db = vdb; 2414 dnode_t *dn; 2415 blkptr_t *bp = zio->io_bp; 2416 blkptr_t *bp_orig = &zio->io_bp_orig; 2417 spa_t *spa = zio->io_spa; 2418 int64_t delta; 2419 uint64_t fill = 0; 2420 int i; 2421 2422 ASSERT3P(db->db_blkptr, ==, bp); 2423 2424 DB_DNODE_ENTER(db); 2425 dn = DB_DNODE(db); 2426 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2427 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2428 zio->io_prev_space_delta = delta; 2429 2430 if (bp->blk_birth != 0) { 2431 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2432 BP_GET_TYPE(bp) == dn->dn_type) || 2433 (db->db_blkid == DMU_SPILL_BLKID && 2434 BP_GET_TYPE(bp) == dn->dn_bonustype) || 2435 BP_IS_EMBEDDED(bp)); 2436 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2437 } 2438 2439 mutex_enter(&db->db_mtx); 2440 2441 #ifdef ZFS_DEBUG 2442 if (db->db_blkid == DMU_SPILL_BLKID) { 2443 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2444 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2445 db->db_blkptr == &dn->dn_phys->dn_spill); 2446 } 2447 #endif 2448 2449 if (db->db_level == 0) { 2450 mutex_enter(&dn->dn_mtx); 2451 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2452 db->db_blkid != DMU_SPILL_BLKID) 2453 dn->dn_phys->dn_maxblkid = db->db_blkid; 2454 mutex_exit(&dn->dn_mtx); 2455 2456 if (dn->dn_type == DMU_OT_DNODE) { 2457 dnode_phys_t *dnp = db->db.db_data; 2458 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2459 i--, dnp++) { 2460 if (dnp->dn_type != DMU_OT_NONE) 2461 fill++; 2462 } 2463 } else { 2464 if (BP_IS_HOLE(bp)) { 2465 fill = 0; 2466 } else { 2467 fill = 1; 2468 } 2469 } 2470 } else { 2471 blkptr_t *ibp = db->db.db_data; 2472 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2473 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2474 if (BP_IS_HOLE(ibp)) 2475 continue; 
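/* Accumulate the fill count of each non-hole child block pointer. */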
2476 fill += BP_GET_FILL(ibp); 2477 } 2478 } 2479 DB_DNODE_EXIT(db); 2480 2481 if (!BP_IS_EMBEDDED(bp)) 2482 bp->blk_fill = fill; 2483 2484 mutex_exit(&db->db_mtx); 2485 } 2486 2487 /* 2488 * The SPA will call this callback several times for each zio - once 2489 * for every physical child i/o (zio->io_phys_children times). This 2490 * allows the DMU to monitor the progress of each logical i/o. For example, 2491 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2492 * block. There may be a long delay before all copies/fragments are completed, 2493 * so this callback allows us to retire dirty space gradually, as the physical 2494 * i/os complete. 2495 */ 2496 /* ARGSUSED */ 2497 static void 2498 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2499 { 2500 dmu_buf_impl_t *db = arg; 2501 objset_t *os = db->db_objset; 2502 dsl_pool_t *dp = dmu_objset_pool(os); 2503 dbuf_dirty_record_t *dr; 2504 int delta = 0; 2505 2506 dr = db->db_data_pending; 2507 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2508 2509 /* 2510 * The callback will be called io_phys_children times. Retire one 2511 * portion of our dirty space each time we are called. Any rounding 2512 * error will be cleaned up by dsl_pool_sync()'s call to 2513 * dsl_pool_undirty_space(). 2514 */ 2515 delta = dr->dr_accounted / zio->io_phys_children; 2516 dsl_pool_undirty_space(dp, delta, zio->io_txg); 2517 } 2518 2519 /* ARGSUSED */ 2520 static void 2521 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2522 { 2523 dmu_buf_impl_t *db = vdb; 2524 blkptr_t *bp_orig = &zio->io_bp_orig; 2525 blkptr_t *bp = db->db_blkptr; 2526 objset_t *os = db->db_objset; 2527 dmu_tx_t *tx = os->os_synctx; 2528 dbuf_dirty_record_t **drp, *dr; 2529 2530 ASSERT0(zio->io_error); 2531 ASSERT(db->db_blkptr == bp); 2532 2533 /* 2534 * For nopwrites and rewrites we ensure that the bp matches our 2535 * original and bypass all the accounting. 
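 * Otherwise the previous block pointer is killed and the new one is born
 * on the dataset, keeping the dataset's space accounting correct.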
2536 */ 2537 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2538 ASSERT(BP_EQUAL(bp, bp_orig)); 2539 } else { 2540 dsl_dataset_t *ds = os->os_dsl_dataset; 2541 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2542 dsl_dataset_block_born(ds, bp, tx); 2543 } 2544 2545 mutex_enter(&db->db_mtx); 2546 2547 DBUF_VERIFY(db); 2548 2549 drp = &db->db_last_dirty; 2550 while ((dr = *drp) != db->db_data_pending) 2551 drp = &dr->dr_next; 2552 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2553 ASSERT(dr->dr_dbuf == db); 2554 ASSERT(dr->dr_next == NULL); 2555 *drp = dr->dr_next; 2556 2557 #ifdef ZFS_DEBUG 2558 if (db->db_blkid == DMU_SPILL_BLKID) { 2559 dnode_t *dn; 2560 2561 DB_DNODE_ENTER(db); 2562 dn = DB_DNODE(db); 2563 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2564 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2565 db->db_blkptr == &dn->dn_phys->dn_spill); 2566 DB_DNODE_EXIT(db); 2567 } 2568 #endif 2569 2570 if (db->db_level == 0) { 2571 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2572 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2573 if (db->db_state != DB_NOFILL) { 2574 if (dr->dt.dl.dr_data != db->db_buf) 2575 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2576 db)); 2577 else if (!arc_released(db->db_buf)) 2578 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2579 } 2580 } else { 2581 dnode_t *dn; 2582 2583 DB_DNODE_ENTER(db); 2584 dn = DB_DNODE(db); 2585 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2586 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2587 if (!BP_IS_HOLE(db->db_blkptr)) { 2588 int epbs = 2589 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2590 ASSERT3U(db->db_blkid, <=, 2591 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2592 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2593 db->db.db_size); 2594 if (!arc_released(db->db_buf)) 2595 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2596 } 2597 DB_DNODE_EXIT(db); 2598 mutex_destroy(&dr->dt.di.dr_mtx); 2599 list_destroy(&dr->dt.di.dr_children); 2600 } 2601 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2602 2603 cv_broadcast(&db->db_changed); 2604 ASSERT(db->db_dirtycnt > 0); 2605 db->db_dirtycnt -= 1; 2606 db->db_data_pending = NULL; 2607 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2608 } 2609 2610 static void 2611 dbuf_write_nofill_ready(zio_t *zio) 2612 { 2613 dbuf_write_ready(zio, NULL, zio->io_private); 2614 } 2615 2616 static void 2617 dbuf_write_nofill_done(zio_t *zio) 2618 { 2619 dbuf_write_done(zio, NULL, zio->io_private); 2620 } 2621 2622 static void 2623 dbuf_write_override_ready(zio_t *zio) 2624 { 2625 dbuf_dirty_record_t *dr = zio->io_private; 2626 dmu_buf_impl_t *db = dr->dr_dbuf; 2627 2628 dbuf_write_ready(zio, NULL, db); 2629 } 2630 2631 static void 2632 dbuf_write_override_done(zio_t *zio) 2633 { 2634 dbuf_dirty_record_t *dr = zio->io_private; 2635 dmu_buf_impl_t *db = dr->dr_dbuf; 2636 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2637 2638 mutex_enter(&db->db_mtx); 2639 if (!BP_EQUAL(zio->io_bp, obp)) { 2640 if (!BP_IS_HOLE(obp)) 2641 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2642 arc_release(dr->dt.dl.dr_data, db); 2643 } 2644 mutex_exit(&db->db_mtx); 2645 2646 dbuf_write_done(zio, NULL, db); 2647 } 2648 2649 /* Issue I/O to commit a dirty buffer to disk. 
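 * Depending on the dirty record, the write is issued in one of three ways:
 * as an "override" write whose BP was produced in open context (by dmu_sync()
 * or dmu_buf_write_embedded()), as a NOFILL write that goes through the ZIO
 * pipeline without write data (ZIO_FLAG_NODATA), or as a regular arc_write()
 * of the dbuf's ARC buffer.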
*/ 2650 static void 2651 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2652 { 2653 dmu_buf_impl_t *db = dr->dr_dbuf; 2654 dnode_t *dn; 2655 objset_t *os; 2656 dmu_buf_impl_t *parent = db->db_parent; 2657 uint64_t txg = tx->tx_txg; 2658 zbookmark_phys_t zb; 2659 zio_prop_t zp; 2660 zio_t *zio; 2661 int wp_flag = 0; 2662 2663 DB_DNODE_ENTER(db); 2664 dn = DB_DNODE(db); 2665 os = dn->dn_objset; 2666 2667 if (db->db_state != DB_NOFILL) { 2668 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2669 /* 2670 * Private object buffers are released here rather 2671 * than in dbuf_dirty() since they are only modified 2672 * in the syncing context and we don't want the 2673 * overhead of making multiple copies of the data. 2674 */ 2675 if (BP_IS_HOLE(db->db_blkptr)) { 2676 arc_buf_thaw(data); 2677 } else { 2678 dbuf_release_bp(db); 2679 } 2680 } 2681 } 2682 2683 if (parent != dn->dn_dbuf) { 2684 /* Our parent is an indirect block. */ 2685 /* We have a dirty parent that has been scheduled for write. */ 2686 ASSERT(parent && parent->db_data_pending); 2687 /* Our parent's buffer is one level closer to the dnode. */ 2688 ASSERT(db->db_level == parent->db_level-1); 2689 /* 2690 * We're about to modify our parent's db_data by modifying 2691 * our block pointer, so the parent must be released. 2692 */ 2693 ASSERT(arc_released(parent->db_buf)); 2694 zio = parent->db_data_pending->dr_zio; 2695 } else { 2696 /* Our parent is the dnode itself. */ 2697 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2698 db->db_blkid != DMU_SPILL_BLKID) || 2699 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2700 if (db->db_blkid != DMU_SPILL_BLKID) 2701 ASSERT3P(db->db_blkptr, ==, 2702 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2703 zio = dn->dn_zio; 2704 } 2705 2706 ASSERT(db->db_level == 0 || data == db->db_buf); 2707 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2708 ASSERT(zio); 2709 2710 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2711 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2712 db->db.db_object, db->db_level, db->db_blkid); 2713 2714 if (db->db_blkid == DMU_SPILL_BLKID) 2715 wp_flag = WP_SPILL; 2716 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2717 WP_SET_SPECIALCLASS(wp_flag, dr->dr_usesc); 2718 2719 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2720 DB_DNODE_EXIT(db); 2721 2722 if (db->db_level == 0 && 2723 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2724 /* 2725 * The BP for this block has been provided by open context 2726 * (by dmu_sync() or dmu_buf_write_embedded()). 2727 */ 2728 void *contents = (data != NULL) ? 
data->b_data : NULL; 2729 2730 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2731 db->db_blkptr, contents, db->db.db_size, &zp, 2732 dbuf_write_override_ready, NULL, dbuf_write_override_done, 2733 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2734 mutex_enter(&db->db_mtx); 2735 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2736 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2737 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2738 mutex_exit(&db->db_mtx); 2739 } else if (db->db_state == DB_NOFILL) { 2740 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2741 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2742 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2743 db->db_blkptr, NULL, db->db.db_size, &zp, 2744 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2745 ZIO_PRIORITY_ASYNC_WRITE, 2746 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2747 } else { 2748 ASSERT(arc_released(data)); 2749 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2750 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2751 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2752 dbuf_write_physdone, dbuf_write_done, db, 2753 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2754 } 2755 } --- EOF ---
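
/*
 * A minimal usage sketch of the hold/release interface implemented above,
 * for illustration only: "dn", "blkid" and "tx" are placeholders for a held
 * dnode, a block id within that dnode, and an assigned transaction. Most
 * consumers go through the dmu_buf_*() wrappers rather than calling these
 * functions directly.
 *
 *	dmu_buf_impl_t *db;
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);	 (takes a hold, drops db_mtx)
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 *		dmu_buf_will_dirty(&db->db, tx); (copy-on-write before modifying)
 *		... modify db->db.db_data ...
 *		dbuf_rele(db, FTAG);		 (drops the hold from dbuf_hold())
 *	}
 */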