Print this page
patch nuke-the-dbuf-hash
patch make-the-merge-easy

*** 82,222 **** mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } - /* - * dbuf hash table routines - */ - #pragma align 64(dbuf_hash_table) - static dbuf_hash_table_t dbuf_hash_table; - - static uint64_t dbuf_hash_count; - - static uint64_t - dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) - { - uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); - - return (crc); - } - - #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ - ((dbuf)->db.db_object == (obj) && \ - (dbuf)->db_objset == (os) && \ - (dbuf)->db_level == (level) && \ - (dbuf)->db_blkid == (blkid)) - dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { - dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { - if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { ! mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } - } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (NULL); - } - - /* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - */ - static dmu_buf_impl_t * - dbuf_hash_insert(dmu_buf_impl_t *db) - { - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_t *os = db->db_objset; - uint64_t obj = db->db.db_object; - int level = db->db_level; - uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *dbf; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { - mutex_enter(&dbf->db_mtx); - if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (dbf); - } - mutex_exit(&dbf->db_mtx); - } - } - - mutex_enter(&db->db_mtx); - db->db_hash_next = h->hash_table[idx]; - h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_inc_64(&dbuf_hash_count); return (NULL); } - /* - * Remove an entry from the hash table. It must be in the EVICTING state. - */ - static void - dbuf_hash_remove(dmu_buf_impl_t *db) - { - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *dbf, **dbp; - - /* - * We musn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. - */ - ASSERT(refcount_is_zero(&db->db_holds)); - ASSERT(db->db_state == DB_EVICTING); - ASSERT(!MUTEX_HELD(&db->db_mtx)); - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - dbp = &h->hash_table[idx]; - while ((dbf = *dbp) != db) { - dbp = &dbf->db_hash_next; - ASSERT(dbf != NULL); - } - *dbp = db->db_hash_next; - db->db_hash_next = NULL; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_hash_count); - } - static arc_evict_func_t dbuf_do_evict; static void dbuf_evict_user(dmu_buf_impl_t *db) { --- 82,125 ---- mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; dmu_buf_impl_t *db; + dmu_buf_impl_t key; + avl_index_t where; + + key.db_level = level; + key.db_blkid = blkid; + key.db_state = DB_SEARCH; + + mutex_enter(&dn->dn_dbufs_mtx); + db = avl_find(&dn->dn_dbufs, &key, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if ((db->db_level != level) || (db->db_blkid != blkid)) + break; mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { ! mutex_exit(&dn->dn_dbufs_mtx); return (db); } mutex_exit(&db->db_mtx); } + mutex_exit(&dn->dn_dbufs_mtx); return (NULL); } static arc_evict_func_t dbuf_do_evict; static void dbuf_evict_user(dmu_buf_impl_t *db) {
*** 261,309 **** } void dbuf_init(void) { - uint64_t hsize = 1ULL << 16; - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 4K block size. The table will take up - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). - */ - while (hsize * 4096 < physmem * PAGESIZE) - hsize <<= 1; - - retry: - h->hash_table_mask = hsize - 1; - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; - } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL); } void dbuf_fini(void) { - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(DBUF_HASH_MUTEX(h, i)); - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_cache); } /* * Other stuff. --- 164,181 ----
*** 1738,1747 **** --- 1610,1620 ---- dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; + avl_index_t where; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
*** 1767,1777 **** db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; ! /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; --- 1640,1650 ---- db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; ! /* the bonus dbuf is not placed into the dnode's dbuf tree */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
*** 1781,1806 **** db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } - /* - * Hold the dn_dbufs_mtx while we get the new dbuf - * in the hash table *and* added to the dbufs list. - * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the - * dn_dbufs list. - */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; ! if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } ! avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); --- 1654,1675 ---- db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } mutex_enter(&dn->dn_dbufs_mtx); + mutex_enter(&db->db_mtx); db->db_state = DB_EVICTING; ! if ((odb = avl_find(&dn->dn_dbufs, db, &where))) { /* someone else inserted it first */ + mutex_exit(&db->db_mtx); kmem_cache_free(dbuf_cache, db); + mutex_enter(&odb->db_mtx); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } ! avl_insert(&dn->dn_dbufs, db, where); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx);
*** 1868,1884 **** * moved until after we release the hold. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } - dbuf_hash_remove(db); } db->db_parent = NULL; db->db_buf = NULL; ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); --- 1737,1751 ----