author	Kent Overstreet <kent.overstreet@gmail.com>	2022-08-22 13:23:47 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2022-09-19 14:37:23 -0400
commit	61c417e812474e657e1fc9d90756c0b00268cedd (patch)
tree	136376226e088d5c7a2f51b99cebd41a6b86f0b4
parent	cf86d180bf417a56ad670b0f8cdcfe3b915c87b4 (diff)
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.

The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.

Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.

That approach had some issues, though.

 - Sometimes we'd issue transaction restarts unnecessarily, when no
   deadlock would have actually occurred. Lock ordering restarts have
   become our primary cause of transaction restarts, on some workloads
   totaling 20% of actual transaction commits.

 - To avoid deadlock or livelock, we'd often have to take intent locks
   when we only wanted a read lock: with the lock ordering approach, it
   is actually illegal to hold _any_ read lock while blocking on an
   intent lock, and this has been causing us unnecessary lock contention.

 - It was getting fragile - the various lock ordering rules are not
   trivial, and we'd been seeing occasional livelock issues related to
   this machinery.

So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.

When we block taking a btree lock, after adding ourselves to the
waitlist but before sleeping, we do a DFS of btree transactions waiting
on other btree transactions, starting with the current transaction and
walking our held locks, and transactions blocking on our held locks.

If we find a cycle, we emit a transaction restart. Occasionally (e.g. in
the btree split path) we cannot allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.

Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
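To make the approach concrete, here is a minimal standalone sketch of cycle
detection by DFS over a waits-for graph. This is an illustration, not the
bcachefs implementation: all names here are hypothetical, and the real
detector (in the diff below) walks btree_trans structures and six-lock wait
lists with an explicit stack (struct lock_graph) instead of recursion,
pinning each transaction with a closure ref while it is examined.

	#include <stdbool.h>

	/* One node per transaction; edges point at transactions holding a
	 * lock this one is blocked on. Hypothetical types for illustration. */
	struct txn {
		struct txn	**waits_for;	/* holders of the lock we're blocked on */
		unsigned	nr_waits;
		bool		on_stack;	/* on the current DFS path? */
		bool		visited;	/* already proven cycle-free */
	};

	/* One-shot query: does following wait edges from @t lead back to a
	 * transaction already on the DFS path, i.e. a deadlock cycle? */
	static bool txn_in_cycle(struct txn *t)
	{
		unsigned i;

		if (t->on_stack)
			return true;		/* back edge: found a cycle */
		if (t->visited)
			return false;

		t->visited = t->on_stack = true;

		for (i = 0; i < t->nr_waits; i++)
			if (txn_in_cycle(t->waits_for[i]))
				return true;

		t->on_stack = false;
		return false;
	}

The real detector also has to cope with the graph changing underneath it -
hence the closure refs, the lock_start_time waitlist cursor, and
lock_graph_remove_non_waiters() for entries that stop waiting mid-search.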
-rw-r--r--	fs/bcachefs/btree_iter.c	11
-rw-r--r--	fs/bcachefs/btree_iter.h	7
-rw-r--r--	fs/bcachefs/btree_locking.c	221
-rw-r--r--	fs/bcachefs/btree_locking.h	62
-rw-r--r--	fs/bcachefs/btree_types.h	11
-rw-r--r--	fs/bcachefs/debug.c	6
-rw-r--r--	fs/bcachefs/errcode.h	1
7 files changed, 287 insertions(+), 32 deletions(-)
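A deadlock found by the detector surfaces as an ordinary transaction restart,
so existing callers need no changes. As a hedged sketch of the standard retry
idiom (do_op() is a stand-in for real transaction work):

	int ret;

	do {
		bch2_trans_begin(trans);
		ret = do_op(trans); /* may fail with -BCH_ERR_transaction_restart_would_deadlock */
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

Both transaction_restart_would_deadlock and the new
transaction_restart_deadlock_recursion_limit are subtypes of
BCH_ERR_transaction_restart (see the errcode.h hunk below), so
bch2_err_matches() catches them without any new cases.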
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index c833aa2f235f..794a8a4f0c2c 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2847,8 +2847,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
trans->c = c;
trans->fn = fn;
trans->last_begin_time = ktime_get_ns();
- trans->task = current;
trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn);
+ trans->locking_wait.task = current;
+ closure_init_stack(&trans->ref);
bch2_trans_alloc_paths(trans, c);
@@ -2869,7 +2870,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) {
- if (trans->task->pid < pos->task->pid) {
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
list_add_tail(&trans->list, &pos->list);
goto list_add_done;
}
@@ -2910,6 +2911,8 @@ void bch2_trans_exit(struct btree_trans *trans)
bch2_trans_unlock(trans);
+ closure_sync(&trans->ref);
+
if (s)
s->max_mem = max(s->max_mem, trans->mem_max);
@@ -2985,7 +2988,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
- prt_printf(out, "%i %s\n", trans->task->pid, trans->fn);
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
trans_for_each_path(trans, path) {
if (!path->nodes_locked)
@@ -3017,7 +3020,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
trans->locking_path_idx,
path->cached ? 'c' : 'b',
trans->locking_level,
- lock_types[trans->locking_lock_type],
+ lock_types[trans->locking_wait.lock_want],
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index c464a4b8b616..9594b1689f20 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -74,11 +74,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-#define trans_for_each_path(_trans, _path) \
- for (_path = __trans_next_path((_trans), 0); \
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
(_path); \
_path = __trans_next_path((_trans), (_path)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
unsigned idx = path ? path->sorted_idx + 1 : 0;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 8e8e0e085fb3..96df4391dbfa 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -52,10 +52,223 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,
/* lock */
-void __bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_bkey_cached_common *b)
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+ /* for iterating over held locks: */
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
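+/* The current DFS path through the waits-for graph, used as an explicit stack: */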
+struct lock_graph {
+ struct trans_waiting_for_lock g[4];
+ unsigned nr;
+};
+
+static void lock_graph_pop(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
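+/*
+ * A cycle was found: choose a transaction in it to abort. If the transaction
+ * that initiated the search can fail, restart it; otherwise tell the first
+ * abortable waiter in the cycle to abort (returning 1), waking it so it sees
+ * lock_must_abort.
+ */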
+static noinline int break_cycle(struct lock_graph *g)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ if (i->trans->lock_may_not_fail)
+ continue;
+
+ if (i == g->g)
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_would_deadlock);
+
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 1;
+ }
+
+ BUG();
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+ int ret = 0;
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ /*
+ * @i is no longer waiting on the node we recorded: pop the
+ * stale tail of the graph so the caller can resume the
+ * search from a still-valid entry:
+ */
+ if (i->trans->locking != i->node_want) {
+ while (g->g + g->nr > i)
+ lock_graph_pop(g);
+ return 0;
+ }
+
+ if (i->trans == trans) {
+ ret = break_cycle(g);
+ goto deadlock;
+ }
+ }
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ goto deadlock;
+ }
+
+ closure_get(&trans->ref);
+
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+
+ return 0;
+deadlock:
+ while (g->nr)
+ lock_graph_pop(g);
+ return ret;
+}
+
+#if 0
+static void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_str(out, "Found lock cycle:");
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+#endif
+
+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want) {
+ while (g->g + g->nr > i)
+ lock_graph_pop(g);
+ return;
+ }
+
+ BUG();
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+ return t1 + t2 > 1;
+}
+
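+/*
+ * Iterative DFS over the waits-for graph: lock_graph g is the current search
+ * path. Each entry remembers which of its transaction's paths, which level,
+ * and which waitlist position (the lock_start_time cursor) it was scanning,
+ * so the search can resume where it left off after descending and popping
+ * back up.
+ */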
+static int check_for_deadlock(struct btree_trans *trans)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ int ret;
+
+ if (trans->lock_must_abort)
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+
+ g.nr = 0;
+ ret = lock_graph_descend(&g, trans);
+ BUG_ON(ret);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_from(top->trans, path, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (top->path_idx != path->idx) {
+ top->path_idx = path->idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+ struct btree_trans *prev = NULL;
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (unlikely(IS_ERR_OR_NULL(b))) {
+ lock_graph_remove_non_waiters(&g);
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ /*
+ * We need transactions on waitlists to be
+ * strictly ordered by lock_start_time, since we
+ * use it as a cursor. Since adding to the
+ * waitlist is done in the six lock code and
+ * might race, fix up if necessary here:
+ */
+ if (prev &&
+ time_before_eq64(trans->lock_start_time, prev->lock_start_time))
+ trans->lock_start_time = prev->lock_start_time + 1;
+ prev = trans;
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->lock_start_time))
+ continue;
+
+ top->lock_start_time = trans->lock_start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ ret = lock_graph_descend(&g, trans);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ if (ret)
+ return ret < 0 ? ret : 0;
+ goto next;
+
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ lock_graph_pop(&g);
+ goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return check_for_deadlock(trans);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
/*
* Must drop our read locks before calling six_lock_write() -
@@ -64,8 +277,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
* locked:
*/
six_lock_readers_add(&b->lock, -readers);
- btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
six_lock_readers_add(&b->lock, readers);
+
+ return ret;
}
static inline bool path_has_read_locks(struct btree_path *path)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7d35fad1340d..0f79d6e546b3 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -184,22 +184,37 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
void bch2_btree_node_unlock_write(struct btree_trans *,
struct btree_path *, struct btree *);
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
/* lock: */
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail)
+{
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->locking = b;
+ trans->lock_start_time = local_clock();
+ trans->lock_must_abort = false;
+
+ return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
+}
+
static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- six_lock_type(&b->lock, type, NULL, NULL);
- return 0;
+ return __btree_node_lock_nopath(trans, b, type, false);
}
static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- int ret = btree_node_lock_nopath(trans, b, type);
+ int ret = __btree_node_lock_nopath(trans, b, type, true);
BUG_ON(ret);
}
@@ -211,8 +226,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- int ret;
-
if (six_trylock_type(&b->lock, type))
return 0;
@@ -220,11 +233,11 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
- trans->locking_lock_type = type;
+ trans->lock_may_not_fail = false;
trans->locking = b;
- ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
- trans->locking = NULL;
- return ret;
+ trans->lock_start_time = local_clock();
+ return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
}
/*
@@ -280,12 +293,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
return ret;
}
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
+ int ret;
+
EBUG_ON(&path->l[b->level].b->c != b);
EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -297,8 +313,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
*/
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
- if (unlikely(!six_trylock_write(&b->lock)))
- __bch2_btree_node_lock_write(trans, b);
+ ret = likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+ return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ int ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
}
static inline int __must_check
@@ -306,8 +335,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- bch2_btree_node_lock_write_nofail(trans, path, b);
- return 0;
+ return __btree_node_lock_write(trans, path, b, false);
}
/* relock: */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 7c01663721ed..33103d20ab09 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -389,15 +389,20 @@ struct btree_trans_commit_hook {
struct btree_trans {
struct bch_fs *c;
const char *fn;
+ struct closure ref;
struct list_head list;
u64 last_begin_time;
- struct btree_bkey_cached_common *locking;
+
unsigned locking_path_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
- u8 locking_lock_type;
- struct task_struct *task;
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ u64 lock_start_time;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
int srcu_idx;
u8 fn_idx;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index fb518d59a134..55c05d234c30 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->task->pid <= i->iter)
+ if (trans->locking_wait.task->pid <= i->iter)
continue;
ret = flush_buf(i);
@@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- prt_backtrace(&i->buf, trans->task);
+ prt_backtrace(&i->buf, trans->locking_wait.task);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = trans->task->pid;
+ i->iter = trans->locking_wait.task->pid;
}
mutex_unlock(&c->btree_trans_lock);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 135d9eba6203..905b587c8862 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -35,6 +35,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \