summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2021-05-26 01:03:48 -0400
committerKent Overstreet <kent.overstreet@gmail.com>2021-05-30 15:04:49 -0400
commit397bb8493e462d35e173fcffe6ff062ec0539101 (patch)
tree127bbaa58e728e9b992f4885d9064941b98ed94a
parent68a095b6760e176e1b202e44969070383d7b7e98 (diff)
bcachefs: Fix a deadlock
Waiting on a btree node write with btree locks held can deadlock, if the write errors: the write error path has to do do a btree update to drop the pointer to the replica that errored. The interior update path has to wait on in flight btree writes before freeing nodes on disk. Previously, this was done in bch2_btree_interior_update_will_free_node(), and could deadlock; now, we just stash a pointer to the node and do it in btree_update_nodes_written(), just prior to the transactional part of the update.
-rw-r--r--fs/bcachefs/btree_io.c4
-rw-r--r--fs/bcachefs/btree_update_interior.c26
-rw-r--r--fs/bcachefs/btree_update_interior.h4
3 files changed, 27 insertions, 7 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 38bfad2566f8..47cfd8a08f91 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1717,6 +1717,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
return;
if (old & (1 << BTREE_NODE_write_in_flight)) {
+ /*
+ * XXX waiting on btree writes with btree locks held -
+ * this can deadlock, and we hit the write error path
+ */
btree_node_wait_on_io(b);
continue;
}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index c55df177d7f2..b0484c7acb79 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -551,6 +551,22 @@ static void btree_update_nodes_written(struct btree_update *as)
BUG_ON(!journal_pin_active(&as->journal));
/*
+ * Wait for any in flight writes to finish before we free the old nodes
+ * on disk:
+ */
+ for (i = 0; i < as->nr_old_nodes; i++) {
+ struct btree_node *bn = READ_ONCE(as->old_nodes[i]->data);
+
+ /*
+ * This is technically a use after free, but it's just a read -
+ * but it might cause problems in userspace where freeing the
+ * buffer may unmap it:
+ */
+ if (bn && bn->keys.seq == as->old_nodes_seq[i])
+ btree_node_wait_on_io(as->old_nodes[i]);
+ }
+
+ /*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
@@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
btree_update_will_delete_key(as, &b->key);
- /*
- * XXX: Waiting on io with btree node locks held, we don't want to be
- * doing this. We can't have btree writes happening after the space has
- * been freed, but we really only need to block before
- * btree_update_nodes_written_trans() happens.
- */
- btree_node_wait_on_io(b);
+ as->old_nodes[as->nr_old_nodes] = b;
+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+ as->nr_old_nodes++;
}
void bch2_btree_update_done(struct btree_update *as)
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7eef3dbb6ef1..7ed67b47e1b9 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -92,6 +92,10 @@ struct btree_update {
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_old_nodes;
+
open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
BCH_REPLICAS_MAX];
open_bucket_idx_t nr_open_buckets;