author    Kent Overstreet <kent.overstreet@gmail.com>  2022-03-04 19:16:04 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2022-05-30 18:17:10 -0400
commit    b02113e2c46ed16855bf9dc767d333a2b54e4416 (patch)
tree      69400d8153a48a7216cb31bbf7288a74f4ae65b2
parent    9ebdefcf488f1dde402669139553dc9bf1f8f37f (diff)
bcachefs: Fix usage of six lock's percpu mode
Six locks have a percpu mode, which we use for interior btree nodes, as well as btree key cache keys for the subvolumes btree. We've been switching locks back and forth between percpu and non-percpu mode as needed, but it turns out this is racy - when we're reusing an existing node, other threads could be attempting to lock it while we're switching it between modes.

This patch fixes this by never switching 'struct btree' between the two modes, and instead segregating them between two different freed lists.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
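At its core the change is small: the freed list a node goes on is keyed off whether its six lock was set up in percpu-reader mode (lock.readers is non-NULL), and that mode is now decided once at allocation time and never changed afterwards. A minimal sketch of the selection logic, using simplified stand-in types rather than the real bcachefs structures:

#include <linux/list.h>
#include <linux/percpu.h>

/*
 * Sketch only: simplified stand-ins for the real bcachefs types, to show
 * the idea behind the patch.  A node's lock mode is fixed when the node
 * is allocated; freed nodes are parked on the list matching that mode, so
 * a 'struct btree' is never flipped between percpu and non-percpu locking
 * while another thread may be trying to take the lock.
 */
struct sketch_lock {
	unsigned __percpu	*readers;	/* non-NULL iff percpu-reader mode */
};

struct sketch_btree {
	struct sketch_lock	lock;
	struct list_head	list;
};

struct sketch_btree_cache {
	struct list_head	freed_pcpu;	/* freed nodes with percpu read locks */
	struct list_head	freed_nonpcpu;	/* freed nodes with ordinary locks */
};

static void sketch_node_to_freedlist(struct sketch_btree_cache *bc,
				     struct sketch_btree *b)
{
	if (b->lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

Allocation then makes the same decision in the other direction: bch2_btree_node_mem_alloc() gains a pcpu_read_locks argument, pulls from the matching freed list, and only calls six_lock_pcpu_alloc() on a freshly allocated node.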
 fs/bcachefs/btree_cache.c           | 41
 fs/bcachefs/btree_cache.h           |  2
 fs/bcachefs/btree_io.c              |  2
 fs/bcachefs/btree_key_cache.c       | 10
 fs/bcachefs/btree_types.h           |  3
 fs/bcachefs/btree_update_interior.c | 95
 fs/bcachefs/btree_update_interior.h |  6
 fs/bcachefs/debug.c                 |  5
 8 files changed, 97 insertions(+), 67 deletions(-)
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 91b6a7c5e4ba..0dcdc30c6888 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -42,6 +42,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
return max_t(int, 0, bc->used - bc->reserve);
}
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
@@ -58,7 +66,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
b->aux_data = NULL;
bc->used--;
- list_move(&b->list, &bc->freed);
+
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -163,11 +172,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
- if (level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free_rcu(&b->c.lock);
-
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
@@ -433,8 +437,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
BUG_ON(atomic_read(&c->btree_cache.dirty));
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
@@ -488,7 +494,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
@@ -563,9 +570,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
struct btree_cache *bc = &c->btree_cache;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
@@ -577,7 +587,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
+ list_for_each_entry(b, freed, list)
if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
@@ -587,6 +597,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
if (!b)
goto err_locked;
+ if (pcpu_read_locks)
+ six_lock_pcpu_alloc(&b->c.lock);
+
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
got_node:
@@ -599,7 +612,7 @@ got_node:
if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
- list_move(&b2->list, &bc->freed);
+ btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
goto got_mem;
@@ -644,7 +657,7 @@ err_locked:
if (b) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
- list_move(&b2->list, &bc->freed);
+ btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
} else {
@@ -689,7 +702,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return ERR_PTR(-EINTR);
}
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
if (trans && b == ERR_PTR(-ENOMEM)) {
trans->memory_allocation_failure = true;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 2901f0dc925b..25906127c023 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -22,7 +22,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 39c0274e88ee..3a12d055c1b7 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1547,7 +1547,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 167d177150c4..ee89b650f6a4 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -165,13 +165,13 @@ btree_key_cache_create(struct bch_fs *c,
}
was_new = false;
+ } else {
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
}
- if (btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
-
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 09b6db1d93f2..e6deb3a4494b 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -152,7 +152,8 @@ struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 8ce40963d07f..014d607ab6e7 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
struct write_point *wp;
@@ -242,7 +243,7 @@ retry:
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, interior_node);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->nr_prealloc_nodes);
+ BUG_ON(!p->nr);
- b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ b = p->b[--p->nr];
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
@@ -377,47 +379,54 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
static void bch2_btree_reserve_put(struct btree_update *as)
{
struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
mutex_lock(&c->btree_reserve_cache_lock);
- while (as->nr_prealloc_nodes) {
- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- __btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
}
mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+static int bch2_btree_reserve_get(struct btree_update *as,
+ unsigned nr_nodes[2],
unsigned flags)
{
struct bch_fs *c = as->c;
struct closure cl;
struct btree *b;
+ unsigned interior;
int ret;
closure_init_stack(&cl);
retry:
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
@@ -430,16 +439,21 @@ retry:
if (ret)
goto err;
- while (as->nr_prealloc_nodes < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : &cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err;
- }
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(c, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT
+ ? NULL : &cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
- as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
+ p->b[p->nr++] = b;
+ }
}
bch2_btree_cache_cannibalize_unlock(c);
@@ -452,7 +466,7 @@ err:
if (ret == -EAGAIN)
goto retry;
- trace_btree_reserve_get_fail(c, nr_nodes, &cl);
+ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl);
return ret;
}
@@ -954,7 +968,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- unsigned nr_nodes = 0;
+ unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
int journal_flags = 0;
int ret = 0;
@@ -967,7 +981,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
while (1) {
- nr_nodes += 1 + split;
+ nr_nodes[!!update_level] += 1 + split;
update_level++;
if (!btree_path_node(path, update_level))
@@ -982,7 +996,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
/* Might have to allocate a new root: */
if (update_level < BTREE_MAX_DEPTH)
- nr_nodes += 1;
+ nr_nodes[1] += 1;
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
@@ -1046,7 +1060,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * btree_sectors(c),
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
@@ -1076,11 +1090,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
- if (b->c.level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free(&b->c.lock);
-
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
@@ -2007,7 +2016,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
return -EINTR;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(c, false);
}
path->intent_ref++;
@@ -2083,7 +2092,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8dc86fa636d6..e72eb8795616 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -76,8 +76,10 @@ struct btree_update {
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_prealloc_nodes;
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 1fff03d301a9..457fcee7d8e1 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
pr_newline(out);
+ pr_buf(out, "pcpu read locks: ");
+ pr_tab(out);
+ pr_buf(out, "%u", b->c.lock.readers != NULL);
+ pr_newline(out);
+
pr_buf(out, "written:");
pr_tab(out);
pr_buf(out, "%u", b->written);