author		Kent Overstreet <kent.overstreet@gmail.com>	2016-07-04 23:38:55 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-18 21:39:40 -0900
commit		b210aed6d95bfc6c7adc66878c9dffd01eb8c17a (patch)
tree		2808a5fdf0a0c26d266594ad939bf59131dc4da6
parent		f1c9ab978f6d01d32a92d081610ed8946eb3c48c (diff)
bcache: locking fixes
-rw-r--r--	drivers/md/bcache/btree_cache.c		 29
-rw-r--r--	drivers/md/bcache/btree_iter.c		119
-rw-r--r--	drivers/md/bcache/btree_update.c	 23
-rw-r--r--	drivers/md/bcache/dirent.c		 10
-rw-r--r--	drivers/md/bcache/str_hash.h		 12
5 files changed, 119 insertions, 74 deletions
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index fe50ddd5d799..f8f4b7e1b48a 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -638,6 +638,11 @@ retry:
rcu_read_unlock();
if (unlikely(!b)) {
+ /*
+ * We must have the parent locked to call bch_btree_node_fill(),
+ * else we could read in a btree node from disk that's been
+ * freed:
+ */
b = bch_btree_node_fill(iter, k, level, cl);
/* We raced and found the btree node in the cache */
@@ -648,6 +653,13 @@ retry:
BUG_ON(PTR_ERR(b) != -EAGAIN);
return b;
}
+
+ /*
+ * But we still have to drop read locks before we return, for
+ * deadlock avoidance:
+ */
+ if (btree_node_read_locked(iter, level + 1))
+ btree_node_unlock(iter, level + 1);
} else {
/*
* There's a potential deadlock with splits and insertions into
@@ -665,14 +677,17 @@ retry:
* we're starting to take intent locks - and handle the race.
*
* The race is that they might be about to free the node we
- * want, and dropping our read lock lets them add the
- * replacement node's pointer to the parent and then free the
- * old node (the node we're trying to lock).
+ * want, and dropping our read lock on the parent node lets them
+ * update the parent, marking the node we want as freed, and then
+ * free it:
+ *
+ * To guard against this, btree nodes are evicted from the cache
+ * when they're freed - and PTR_HASH() is zeroed out, which will
+ * cause our btree_node_lock() call below to fail.
*
- * After we take the intent lock on the node we want (which
- * protects against it being freed), we check if we might have
- * raced (and the node was freed before we locked it) with a
- * global sequence number for freed btree nodes.
+ * Then, btree_node_relock() on the parent will fail - because
+ * the parent was modified, when the pointer to the node we want
+ * was removed - and we'll bail out:
*/
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
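
The guard described in the comment above amounts to a relock-by-sequence-number check. Below is a minimal standalone sketch of that idea; all names (struct node, struct parent_ref, relock_parent(), use_child()) are hypothetical and are not the bcache API.

/*
 * Sketch, not part of the patch: before using a child node found in the
 * cache, re-check that the child still matches the key (its hash is zeroed
 * when the node is freed) and that the parent can still be relocked at the
 * sequence number we remembered - if either check fails, the lookup raced
 * with a free and the caller retries.
 */
#include <stdbool.h>
#include <stddef.h>

struct node {
	unsigned	lock_seq;	/* bumped every time the node is write locked */
	unsigned long	hash;		/* zeroed when the node is freed */
};

struct parent_ref {
	struct node	*parent;
	unsigned	seq;		/* parent->lock_seq when we last held its lock */
};

static bool relock_parent(struct parent_ref *p)
{
	/* Fails if the parent was modified after we dropped our lock: */
	return p->parent->lock_seq == p->seq;
}

static struct node *use_child(struct parent_ref *p, struct node *child,
			      unsigned long key_hash)
{
	if (child->hash != key_hash)
		return NULL;	/* child was freed and evicted - retry */

	if (!relock_parent(p))
		return NULL;	/* parent changed under us - retry */

	return child;
}
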
diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c
index 5a2e2b974339..97dfb7992a1c 100644
--- a/drivers/md/bcache/btree_iter.c
+++ b/drivers/md/bcache/btree_iter.c
@@ -89,6 +89,7 @@ void __btree_node_lock_write(struct btree *b, struct btree_iter *iter)
static bool btree_lock_upgrade(struct btree_iter *iter, unsigned level)
{
+ struct btree_iter *linked;
struct btree *b = iter->nodes[level];
if (btree_node_intent_locked(iter, level))
@@ -99,14 +100,25 @@ static bool btree_lock_upgrade(struct btree_iter *iter, unsigned level)
if (btree_node_locked(iter, level)
? six_trylock_convert(&b->lock, SIX_LOCK_read, SIX_LOCK_intent)
- : six_relock_intent(&b->lock, iter->lock_seq[level])) {
- mark_btree_node_intent_locked(iter, level);
- trace_bcache_btree_upgrade_lock(b, iter);
- return true;
- }
+ : six_relock_intent(&b->lock, iter->lock_seq[level]))
+ goto success;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_intent_locked(linked, level) &&
+ iter->lock_seq[level] == b->lock.state.seq) {
+ btree_node_unlock(iter, level);
+ six_lock_increment(&b->lock, SIX_LOCK_intent);
+ goto success;
+ }
+
trace_bcache_btree_upgrade_lock_fail(b, iter);
return false;
+success:
+ mark_btree_node_intent_locked(iter, level);
+ trace_bcache_btree_upgrade_lock(b, iter);
+ return true;
}
/* Btree iterator locking: */
@@ -148,19 +160,31 @@ int bch_btree_iter_unlock(struct btree_iter *iter)
bool btree_node_relock(struct btree_iter *iter, unsigned level)
{
+ struct btree_iter *linked;
struct btree *b = iter->nodes[level];
enum six_lock_type type = btree_lock_want(iter, level);
if (btree_node_locked(iter, level))
return true;
- if (!race_fault() &&
- is_btree_node(iter, level) &&
+ if (race_fault())
+ return false;
+
+ if (is_btree_node(iter, level) &&
six_relock_type(&b->lock, iter->lock_seq[level], type)) {
mark_btree_node_locked(iter, level, type);
return true;
}
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == type &&
+ iter->lock_seq[level] == b->lock.state.seq) {
+ six_lock_increment(&b->lock, type);
+ mark_btree_node_locked(iter, level, type);
+ return true;
+ }
+
return false;
}
@@ -365,6 +389,40 @@ void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
iter->pos, iter->is_extents);
}
+static void btree_iter_verify_locking(struct btree_iter *iter, unsigned level)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree_iter *linked;
+
+ if (!btree_want_intent(iter, level))
+ return;
+
+ /*
+ * Can't hold _any_ read locks (including in linked iterators) when
+ * taking intent locks - that leads to a fun deadlock involving write
+ * locks and journal reservations.
+ *
+ * We could conceivably drop read locks, then retake them and if
+ * retaking fails then return -EINTR... but, let's keep things simple
+ * for now:
+ */
+ BUG_ON(iter->nodes_locked != iter->nodes_intent_locked);
+
+ for_each_linked_btree_iter(iter, linked)
+ BUG_ON(linked->nodes_locked != linked->nodes_intent_locked);
+
+ /* Lock ordering: */
+ for_each_linked_btree_iter(iter, linked) {
+ BUG_ON(linked->nodes_locked &&
+ btree_iter_cmp(linked, iter) > 0);
+
+ BUG_ON(linked->nodes_locked &&
+ linked->btree_id == iter->btree_id &&
+ level > __fls(linked->nodes_locked));
+ }
+#endif
+}
+
static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos)
{
struct cache_set *c = iter->c;
@@ -378,6 +436,8 @@ static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos
iter->level = b->level;
+ btree_iter_verify_locking(iter, iter->level);
+
if (btree_node_lock(b, iter, iter->level,
(b != c->btree_roots[iter->btree_id].b))) {
btree_iter_node_set(iter, b, pos);
@@ -399,6 +459,8 @@ static inline int btree_iter_down(struct btree_iter *iter, struct bpos pos,
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
+ btree_iter_verify_locking(iter, iter->level - 1);
+
--iter->level;
btree_iter_node_set(iter, b, pos);
return 0;
@@ -409,42 +471,6 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
-static void btree_iter_verify_locking(struct btree_iter *iter)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- struct btree_iter *linked;
- unsigned level;
-
- /*
- * Can't hold _any_ read locks (including in linked iterators) when
- * taking intent locks, that leads to a fun deadlock involving write
- * locks and journal reservations
- *
- * We could conceivably drop read locks, then retake them and if
- * retaking fails then return -EINTR... but, let's keep things simple
- * for now:
- */
- for_each_linked_btree_iter(iter, linked)
- for (level = 0; level < BTREE_MAX_DEPTH; level++)
- BUG_ON(btree_node_read_locked(linked, level));
-
- /* Lock ordering: */
- for_each_linked_btree_iter(iter, linked)
- BUG_ON(btree_iter_cmp(iter, linked) < 0 &&
- btree_node_intent_locked(linked, 0));
-
- /*
- * Also, we have to take intent locks on interior nodes before leaf
- * nodes - verify that linked iterators don't have intent locks held at
- * depths lower than where we're at:
- */
- for_each_linked_btree_iter(iter, linked)
- BUG_ON(btree_iter_cmp(linked, iter) <= 0 &&
- linked->locks_want > 0 &&
- linked->locks_want < iter->locks_want);
-#endif
-}
-
/*
* This is the main state machine for walking down the btree - walks down to a
* specified depth
@@ -483,9 +509,6 @@ retry:
__btree_iter_advance(iter);
}
- if (iter->locks_want > 0)
- btree_iter_verify_locking(iter);
-
/*
* Note: iter->nodes[iter->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
@@ -756,6 +779,8 @@ void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
memcpy(dst->node_iters,
src->node_iters,
sizeof(src->node_iters));
+
+ bch_btree_iter_upgrade(dst);
}
void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src)
@@ -765,4 +790,6 @@ void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src)
dst->nodes_locked = 0;
dst->nodes_intent_locked = 0;
bch_btree_iter_link(src, dst);
+
+ bch_btree_iter_upgrade(dst);
}
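
The new branches added above in btree_lock_upgrade() and btree_node_relock() share one idea: if a linked iterator already holds the lock we want on the same node, and our remembered sequence number is still current, take an extra reference on the lock rather than blocking on it. A rough sketch of that pattern follows, using hypothetical types and helpers (struct lock, struct iter, try_borrow_lock()); the real code does this with six locks via six_lock_increment().

#include <stdbool.h>

enum lock_type { LOCK_read, LOCK_intent };

struct lock {
	unsigned	seq;		/* bumped whenever the node is modified */
	unsigned	refs[2];	/* held references, indexed by lock_type */
};

struct iter {
	struct iter	*next;		/* circular list of linked iterators */
	struct lock	*node_lock;	/* lock of the node this iterator points at */
	enum lock_type	held_type;
	bool		locked;
	unsigned	lock_seq;	/* node_lock->seq when we last held it */
};

static bool try_borrow_lock(struct iter *iter, enum lock_type want)
{
	struct iter *linked;

	for (linked = iter->next; linked != iter; linked = linked->next)
		if (linked->locked &&
		    linked->node_lock == iter->node_lock &&
		    linked->held_type == want &&
		    iter->lock_seq == iter->node_lock->seq) {
			/* Already held by a linked iterator - just take a ref: */
			iter->node_lock->refs[want]++;
			iter->locked	= true;
			iter->held_type	= want;
			return true;
		}

	return false;
}
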
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 025a29d96138..f90bbe261792 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -1484,8 +1484,7 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
}
reserve = bch_btree_reserve_get(c, b, 0,
- !(flags & BTREE_INSERT_NOFAIL),
- cl);
+ !(flags & BTREE_INSERT_NOFAIL), cl);
if (IS_ERR(reserve)) {
ret = PTR_ERR(reserve);
goto out_get_locks;
@@ -1495,15 +1494,29 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
btree_split(b, iter, NULL, reserve, as);
bch_btree_reserve_put(c, reserve);
+
iter->locks_want = 1;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->btree_id == iter->btree_id &&
+ btree_iter_cmp(linked, iter) <= 0)
+ linked->locks_want = 1;
out:
up_read(&c->gc_lock);
return ret;
out_get_locks:
/* Lock ordering... */
for_each_linked_btree_iter(iter, linked)
- if (btree_iter_cmp(linked, iter) <= 0)
+ if (linked->btree_id == iter->btree_id &&
+ btree_iter_cmp(linked, iter) <= 0) {
+ unsigned i;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ btree_node_unlock(linked, i);
+ linked->lock_seq[i]--;
+ }
linked->locks_want = U8_MAX;
+ }
goto out;
}
@@ -1679,10 +1692,6 @@ retry:
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch_btree_node_write_lazy(i->iter->nodes[0], i->iter);
-
- trans_for_each_entry(trans, i)
- i->iter->locks_want = 1;
-
out:
percpu_ref_put(&c->writes);
return ret;
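
On the out_get_locks: path above, the linked iterators' locks are dropped and their saved lock_seq[] values are perturbed; since relocking goes by sequence number, this should guarantee that a later relock fails and those iterators retraverse from scratch, taking the stronger locks they now want (locks_want = U8_MAX). A compact sketch of that trick, with hypothetical names (struct iter, node_unlock(), force_retraverse()) rather than the real bcache types:

#define MAX_DEPTH	4

struct iter {
	unsigned	lock_seq[MAX_DEPTH];	/* seq of the lock last held, per level */
	unsigned	nodes_locked;		/* bitmask of levels currently locked */
	unsigned	locks_want;		/* how many levels we want intent locked */
};

static void node_unlock(struct iter *iter, unsigned level)
{
	/* hypothetical: release whatever lock is held at @level */
	iter->nodes_locked &= ~(1U << level);
}

static void force_retraverse(struct iter *iter, unsigned locks_want)
{
	unsigned i;

	for (i = 0; i < MAX_DEPTH; i++) {
		node_unlock(iter, i);
		/*
		 * Perturb the remembered sequence number so that a later
		 * relock-by-sequence at this level cannot succeed - the
		 * iterator has to retraverse and take fresh locks:
		 */
		iter->lock_seq[i]--;
	}

	iter->locks_want = locks_want;
}
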
diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c
index f106513928ca..25e70cc9e7e5 100644
--- a/drivers/md/bcache/dirent.c
+++ b/drivers/md/bcache/dirent.c
@@ -221,16 +221,6 @@ int bch_dirent_rename(struct cache_set *c,
do {
/*
- * When taking intent locks, we have to take interior node locks
- * before leaf node locks; if the second iter we traverse has
- * locks_want > the first iter, we could end up taking an intent
- * lock on an interior node after traversing the first iterator
- * only took an intent lock on a leaf.
- */
- src_iter.locks_want = dst_iter.locks_want =
- max(src_iter.locks_want, dst_iter.locks_want);
-
- /*
* Have to traverse lower btree nodes before higher - due to
* lock ordering.
*/
diff --git a/drivers/md/bcache/str_hash.h b/drivers/md/bcache/str_hash.h
index 6bdd9f26c4e6..d296e9e92894 100644
--- a/drivers/md/bcache/str_hash.h
+++ b/drivers/md/bcache/str_hash.h
@@ -256,7 +256,7 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
- bch_btree_iter_init_intent(&iter, c, 0, POS_MIN);
+ bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
bch_btree_iter_link(&hashed_slot, &iter);
do {
@@ -301,12 +301,16 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
insert->k.p = iter.pos;
ret = bch_btree_insert_at(&iter, insert, NULL, NULL,
journal_seq, BTREE_INSERT_ATOMIC);
- /* On successful insert, we don't want to clobber ret with error
- * from iter:
- */
+
+ /* unlock before traversing hashed_slot: */
bch_btree_iter_unlock(&iter);
} while (ret == -EINTR);
+ /*
+ * On successful insert, we don't want to clobber ret with error from
+ * iter:
+ */
+ bch_btree_iter_unlock(&iter);
bch_btree_iter_unlock(&hashed_slot);
return ret;
err: