author		Kent Overstreet <kent.overstreet@gmail.com>	2020-11-02 23:51:33 -0500
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-11-05 12:45:15 -0500
commit		357cedf1a277a54bbdd5b0d359adb397df2ea662 (patch)
tree		441a33a80e7510ee48a4f4987a33530b4dff0642
parent		d0e58c76e2d6812556ae4bb93869a8e5faa2477a (diff)
bcachefs: Improved inode create optimization
This shards new inodes into different btree nodes by using the processor ID
for the high bits of the new inode number. Much faster than the previous
inode create optimization - this also helps with sharding in the other
btrees that index by inode number.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
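Concretely, the patch carves the inode number space into one contiguous slice per CPU. A minimal sketch of that arithmetic (the 8-CPU / CPU-5 figures are only an example, and the u64 cast is added here so the shift is done in 64 bits):

	/* 64-bit inode numbers, 8 possible CPUs => inode_shard_bits == 3 */
	unsigned bits = 63 - inode_shard_bits;	/* 60 low bits per shard      */
	u64 min = (u64) cpu << bits;		/* CPU 5: first inum in slice */
	u64 max = min | ~(ULLONG_MAX << bits);	/* CPU 5: last inum in slice  */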
-rw-r--r--	fs/bcachefs/bcachefs.h	  7
-rw-r--r--	fs/bcachefs/inode.c	139
-rw-r--r--	fs/bcachefs/super.c	  6
3 files changed, 54 insertions, 98 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f1e8f6bcce51..10c15d125038 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -813,11 +813,8 @@ struct bch_fs {
struct mutex verify_lock;
#endif
- struct mutex inode_create_lock;
- unsigned unused_inodes_nr;
- u64 unused_inodes[64];
- u32 unused_inodes_gens[64];
- u64 unused_inode_hint;
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
/*
* A btree node on disk could have too many bsets for an iterator to fit
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 9a0991adf550..d7622049069e 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -361,55 +361,6 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-static int scan_free_inums(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
- struct bkey_s_c k;
- u64 min = BLOCKDEV_INODE_MAX;
- u64 max = c->opts.inodes_32bit
- ? S32_MAX : S64_MAX;
- u64 start = max(min, READ_ONCE(c->unused_inode_hint));
- int ret = 0;
-
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start),
- BTREE_ITER_SLOTS);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-again:
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
- if (bkey_cmp(iter->pos, POS(0, max)) > 0)
- break;
-
- /*
- * This doesn't check the btree key cache, but we don't care:
- * we have to recheck with an intent lock held on the slot we're
- * inserting to anyways:
- */
- if (k.k->type != KEY_TYPE_inode) {
- if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) {
- c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset;
- c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k);
- c->unused_inodes_nr++;
- }
-
- if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes))
- goto out;
- }
- }
-
- if (!ret && start != min) {
- max = start;
- start = min;
- bch2_btree_iter_set_pos(iter, POS(0, start));
- goto again;
- }
-out:
- c->unused_inode_hint = iter->pos.offset;
- bch2_trans_iter_put(trans, iter);
- return ret;
-}
-
int bch2_inode_create(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u)
{
@@ -417,64 +368,68 @@ int bch2_inode_create(struct btree_trans *trans,
struct bkey_inode_buf *inode_p;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
- u64 inum;
- u32 generation;
- int ret = 0;
+ u64 min, max, start, *hint;
+ int ret;
+
+ unsigned cpu = raw_smp_processor_id();
+ unsigned bits = (c->opts.inodes_32bit
+ ? 31 : 63) - c->inode_shard_bits;
+
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
+
+ start = READ_ONCE(*hint);
+
+ if (start >= max || start < min)
+ start = min;
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
+again:
+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_cmp(iter->pos, POS(0, max)) > 0)
+ break;
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-retry:
- if (!mutex_trylock(&c->inode_create_lock)) {
- bch2_trans_unlock(trans);
- mutex_lock(&c->inode_create_lock);
- if (!bch2_trans_relock(trans)) {
- mutex_unlock(&c->inode_create_lock);
- ret = -EINTR;
- goto err;
- }
- }
-
- if (!c->unused_inodes_nr)
- ret = scan_free_inums(trans);
- if (!ret && !c->unused_inodes_nr)
- ret = -ENOSPC;
- if (!ret) {
- --c->unused_inodes_nr;
- inum = c->unused_inodes[c->unused_inodes_nr];
- generation = c->unused_inodes_gens[c->unused_inodes_nr];
+ /*
+ * There's a potential cache coherency issue with the btree key
+ * cache code here - we're iterating over the btree, skipping
+ * that cache. We should never see an empty slot that isn't
+ * actually empty due to a pending update in the key cache
+ * because the update that creates the inode isn't done with a
+ * cached iterator, but - better safe than sorry, check the
+ * cache before using a slot:
+ */
+ if (k.k->type != KEY_TYPE_inode &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
+ goto found_slot;
}
- mutex_unlock(&c->inode_create_lock);
-
- if (ret)
- goto err;
-
- bch2_btree_iter_set_pos(iter, POS(0, inum));
+ bch2_trans_iter_put(trans, iter);
- /* Recheck that the slot is free with an intent lock held: */
- k = bch2_btree_iter_peek_cached(iter);
- ret = bkey_err(k);
if (ret)
- goto err;
+ return ret;
- if (k.k->type == KEY_TYPE_inode)
- goto retry;
+ if (start != min) {
+ /* Retry from start */
+ start = min;
+ goto again;
+ }
- inode_u->bi_inum = inum;
- inode_u->bi_generation = generation;
+ return -ENOSPC;
+found_slot:
+ *hint = k.k->p.offset;
+ inode_u->bi_inum = k.k->p.offset;
+ inode_u->bi_generation = bkey_generation(k);
bch2_inode_pack(inode_p, inode_u);
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
-err:
bch2_trans_iter_put(trans, iter);
- return ret;
+ return 0;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
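Read in one piece, the added lines above assemble into the following allocation path. This is a reconstruction for readability (removed lines dropped, kernel indentation restored, and the bch_fs pointer declaration that sits just above the hunk's context included), not a verified copy of the post-patch file:

int bch2_inode_create(struct btree_trans *trans,
		      struct bch_inode_unpacked *inode_u)
{
	struct bch_fs *c = trans->c;
	struct bkey_inode_buf *inode_p;
	struct btree_iter *iter = NULL;
	struct bkey_s_c k;
	u64 min, max, start, *hint;
	int ret;

	unsigned cpu = raw_smp_processor_id();
	unsigned bits = (c->opts.inodes_32bit
			 ? 31 : 63) - c->inode_shard_bits;

	/* this CPU's slice of the inode number space: */
	min = (cpu << bits);
	max = (cpu << bits) | ~(ULLONG_MAX << bits);

	min = max_t(u64, min, BLOCKDEV_INODE_MAX);
	hint = c->unused_inode_hints + cpu;

	start = READ_ONCE(*hint);
	if (start >= max || start < min)
		start = min;

	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
	if (IS_ERR(inode_p))
		return PTR_ERR(inode_p);
again:
	for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_cmp(iter->pos, POS(0, max)) > 0)
			break;

		/* empty slot, and not shadowed by a pending key cache update: */
		if (k.k->type != KEY_TYPE_inode &&
		    !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
			goto found_slot;
	}

	bch2_trans_iter_put(trans, iter);

	if (ret)
		return ret;

	if (start != min) {
		/* hint was stale; retry from the start of this CPU's slice */
		start = min;
		goto again;
	}

	return -ENOSPC;
found_slot:
	*hint			= k.k->p.offset;
	inode_u->bi_inum	= k.k->p.offset;
	inode_u->bi_generation	= bkey_generation(k);

	bch2_inode_pack(inode_p, inode_u);
	bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
	bch2_trans_iter_put(trans, iter);
	return 0;
}

The common case touches no shared state at all: the hint is per CPU, and -ENOSPC is returned only after scanning from the hint to the end of the slice and then again from the start of the slice.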
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0be1e2eec70b..ad1967d5a47d 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -485,6 +485,7 @@ static void __bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
if (c->journal_reclaim_wq)
@@ -696,7 +697,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->usage_lock);
sema_init(&c->io_in_flight, 64);
- mutex_init(&c->inode_create_lock);
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
@@ -737,6 +737,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
@@ -754,6 +756,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
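The hint array is sized for the shard count rather than the exact CPU count: num_possible_cpus() is rounded up to a power of two. A worked example (the local variables here are stand-ins for the struct bch_fs fields, and the 6-CPU figure is only illustrative):

	unsigned shards = roundup_pow_of_two(num_possible_cpus());	/* 6 CPUs -> 8 shards */
	unsigned inode_shard_bits = ilog2(shards);			/* 8 -> 3 bits        */

	/* one allocation hint per shard, indexed by raw_smp_processor_id(): */
	u64 *unused_inode_hints = kcalloc(1U << inode_shard_bits,
					  sizeof(u64), GFP_KERNEL);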