Print this page
patch nuke-the-dbuf-hash
patch make-the-merge-easy
*** 82,222 ****
mutex_destroy(&db->db_mtx);
cv_destroy(&db->db_changed);
refcount_destroy(&db->db_holds);
}
- /*
- * dbuf hash table routines
- */
- #pragma align 64(dbuf_hash_table)
- static dbuf_hash_table_t dbuf_hash_table;
-
- static uint64_t dbuf_hash_count;
-
- static uint64_t
- dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
- {
- uintptr_t osv = (uintptr_t)os;
- uint64_t crc = -1ULL;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
- crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
- return (crc);
- }
-
- #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
- #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
- ((dbuf)->db.db_object == (obj) && \
- (dbuf)->db_objset == (os) && \
- (dbuf)->db_level == (level) && \
- (dbuf)->db_blkid == (blkid))
-
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
- dbuf_hash_table_t *h = &dbuf_hash_table;
objset_t *os = dn->dn_objset;
uint64_t obj = dn->dn_object;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *db;
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
- if (DBUF_EQUAL(db, os, obj, level, blkid)) {
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING) {
! mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (db);
}
mutex_exit(&db->db_mtx);
}
- }
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (NULL);
- }
-
- /*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
- static dmu_buf_impl_t *
- dbuf_hash_insert(dmu_buf_impl_t *db)
- {
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_t *os = db->db_objset;
- uint64_t obj = db->db.db_object;
- int level = db->db_level;
- uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
- if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
- mutex_enter(&dbf->db_mtx);
- if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (dbf);
- }
- mutex_exit(&dbf->db_mtx);
- }
- }
-
- mutex_enter(&db->db_mtx);
- db->db_hash_next = h->hash_table[idx];
- h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_inc_64(&dbuf_hash_count);
return (NULL);
}
- /*
- * Remove an entry from the hash table. It must be in the EVICTING state.
- */
- static void
- dbuf_hash_remove(dmu_buf_impl_t *db)
- {
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf, **dbp;
-
- /*
- * We mustn't hold db_mtx to maintain lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
- */
- ASSERT(refcount_is_zero(&db->db_holds));
- ASSERT(db->db_state == DB_EVICTING);
- ASSERT(!MUTEX_HELD(&db->db_mtx));
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- dbp = &h->hash_table[idx];
- while ((dbf = *dbp) != db) {
- dbp = &dbf->db_hash_next;
- ASSERT(dbf != NULL);
- }
- *dbp = db->db_hash_next;
- db->db_hash_next = NULL;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_dec_64(&dbuf_hash_count);
- }
-
static arc_evict_func_t dbuf_do_evict;
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
--- 82,125 ----
mutex_destroy(&db->db_mtx);
cv_destroy(&db->db_changed);
refcount_destroy(&db->db_holds);
}
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
objset_t *os = dn->dn_objset;
uint64_t obj = dn->dn_object;
dmu_buf_impl_t *db;
+ dmu_buf_impl_t key;
+ avl_index_t where;
+
+ key.db_level = level;
+ key.db_blkid = blkid;
+ key.db_state = DB_SEARCH;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db = avl_find(&dn->dn_dbufs, &key, &where);
+ ASSERT3P(db, ==, NULL);
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if ((db->db_level != level) || (db->db_blkid != blkid))
+ break;
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING) {
! mutex_exit(&dn->dn_dbufs_mtx);
return (db);
}
mutex_exit(&db->db_mtx);
}
+ mutex_exit(&dn->dn_dbufs_mtx);
return (NULL);
}
static arc_evict_func_t dbuf_do_evict;
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
*** 261,309 ****
}
void
dbuf_init(void)
{
- uint64_t hsize = 1ULL << 16;
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 4K block size. The table will take up
- * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
- */
- while (hsize * 4096 < physmem * PAGESIZE)
- hsize <<= 1;
-
- retry:
- h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
- }
-
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL);
}
void
dbuf_fini(void)
{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(DBUF_HASH_MUTEX(h, i));
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
kmem_cache_destroy(dbuf_cache);
}
/*
* Other stuff.
--- 164,181 ----
*** 1738,1747 ****
--- 1610,1620 ----
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
+ avl_index_t where;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_type != DMU_OT_NONE);
db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
*** 1767,1777 ****
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
! /* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
} else if (blkid == DMU_SPILL_BLKID) {
db->db.db_size = (blkptr != NULL) ?
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
--- 1640,1650 ----
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
! /* the bonus dbuf is not placed into the dnode's dbuf tree */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
} else if (blkid == DMU_SPILL_BLKID) {
db->db.db_size = (blkptr != NULL) ?
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
*** 1781,1806 ****
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
- /*
- * Hold the dn_dbufs_mtx while we get the new dbuf
- * in the hash table *and* added to the dbufs list.
- * This prevents a possible deadlock with someone
- * trying to look up this dbuf before it's added to the
- * dn_dbufs list.
- */
mutex_enter(&dn->dn_dbufs_mtx);
db->db_state = DB_EVICTING;
! if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
kmem_cache_free(dbuf_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
! avl_add(&dn->dn_dbufs, db);
if (db->db_level == 0 && db->db_blkid >=
dn->dn_unlisted_l0_blkid)
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
--- 1654,1675 ----
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
mutex_enter(&dn->dn_dbufs_mtx);
+ mutex_enter(&db->db_mtx);
db->db_state = DB_EVICTING;
! if ((odb = avl_find(&dn->dn_dbufs, db, &where))) {
/* someone else inserted it first */
+ mutex_exit(&db->db_mtx);
kmem_cache_free(dbuf_cache, db);
+ mutex_enter(&odb->db_mtx);
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
! avl_insert(&dn->dn_dbufs, db, where);
if (db->db_level == 0 && db->db_blkid >=
dn->dn_unlisted_l0_blkid)
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
*** 1868,1884 ****
* moved until after we release the hold.
*/
dnode_rele(dn, db);
db->db_dnode_handle = NULL;
}
- dbuf_hash_remove(db);
}
db->db_parent = NULL;
db->db_buf = NULL;
ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
--- 1737,1751 ----