author     | Kent Overstreet <kent.overstreet@gmail.com> | 2016-07-04 23:38:55 -0800
committer  | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 21:39:40 -0900
commit     | b210aed6d95bfc6c7adc66878c9dffd01eb8c17a (patch)
tree       | 2808a5fdf0a0c26d266594ad939bf59131dc4da6
parent     | f1c9ab978f6d01d32a92d081610ed8946eb3c48c (diff)
bcache: locking fixes
-rw-r--r-- | drivers/md/bcache/btree_cache.c  |  29
-rw-r--r-- | drivers/md/bcache/btree_iter.c   | 119
-rw-r--r-- | drivers/md/bcache/btree_update.c |  23
-rw-r--r-- | drivers/md/bcache/dirent.c       |  10
-rw-r--r-- | drivers/md/bcache/str_hash.h     |  12
5 files changed, 119 insertions, 74 deletions
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index fe50ddd5d799..f8f4b7e1b48a 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -638,6 +638,11 @@ retry:
 	rcu_read_unlock();
 
 	if (unlikely(!b)) {
+		/*
+		 * We must have the parent locked to call bch_btree_node_fill(),
+		 * else we could read in a btree node from disk that's been
+		 * freed:
+		 */
 		b = bch_btree_node_fill(iter, k, level, cl);
 
 		/* We raced and found the btree node in the cache */
@@ -648,6 +653,13 @@ retry:
 			BUG_ON(PTR_ERR(b) != -EAGAIN);
 			return b;
 		}
+
+		/*
+		 * But we still have to drop read locks before we return, for
+		 * deadlock avoidance:
+		 */
+		if (btree_node_read_locked(iter, level + 1))
+			btree_node_unlock(iter, level + 1);
 	} else {
 		/*
 		 * There's a potential deadlock with splits and insertions into
@@ -665,14 +677,17 @@ retry:
 		 * we're starting to take intent locks - and handle the race.
 		 *
 		 * The race is that they might be about to free the node we
-		 * want, and dropping our read lock lets them add the
-		 * replacement node's pointer to the parent and then free the
-		 * old node (the node we're trying to lock).
+		 * want, and dropping our read lock on the parent node lets them
+		 * update the parent marking the node we want as freed, and then
+		 * free it:
+		 *
+		 * To guard against this, btree nodes are evicted from the cache
+		 * when they're freed - and PTR_HASH() is zeroed out, which will
+		 * cause our btree_node_lock() call below to fail.
 		 *
-		 * After we take the intent lock on the node we want (which
-		 * protects against it being freed), we check if we might have
-		 * raced (and the node was freed before we locked it) with a
-		 * global sequence number for freed btree nodes.
+		 * Then, btree_node_relock() on the parent will fail - because
+		 * the parent was modified, when the pointer to the node we want
+		 * was removed - and we'll bail out:
 		 */
 		if (btree_node_read_locked(iter, level + 1))
 			btree_node_unlock(iter, level + 1);
diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c
index 5a2e2b974339..97dfb7992a1c 100644
--- a/drivers/md/bcache/btree_iter.c
+++ b/drivers/md/bcache/btree_iter.c
@@ -89,6 +89,7 @@ void __btree_node_lock_write(struct btree *b, struct btree_iter *iter)
 
 static bool btree_lock_upgrade(struct btree_iter *iter, unsigned level)
 {
+	struct btree_iter *linked;
 	struct btree *b = iter->nodes[level];
 
 	if (btree_node_intent_locked(iter, level))
@@ -99,14 +100,25 @@ static bool btree_lock_upgrade(struct btree_iter *iter, unsigned level)
 
 	if (btree_node_locked(iter, level)
 	    ? six_trylock_convert(&b->lock, SIX_LOCK_read, SIX_LOCK_intent)
-	    : six_relock_intent(&b->lock, iter->lock_seq[level])) {
-		mark_btree_node_intent_locked(iter, level);
-		trace_bcache_btree_upgrade_lock(b, iter);
-		return true;
-	}
+	    : six_relock_intent(&b->lock, iter->lock_seq[level]))
+		goto success;
+
+	for_each_linked_btree_iter(iter, linked)
+		if (linked->nodes[level] == b &&
+		    btree_node_intent_locked(linked, level) &&
+		    iter->lock_seq[level] == b->lock.state.seq) {
+			btree_node_unlock(iter, level);
+			six_lock_increment(&b->lock, SIX_LOCK_intent);
+			goto success;
+		}
+
 	trace_bcache_btree_upgrade_lock_fail(b, iter);
 	return false;
+success:
+	mark_btree_node_intent_locked(iter, level);
+	trace_bcache_btree_upgrade_lock(b, iter);
+	return true;
 }
 
 /* Btree iterator locking: */
 
@@ -148,19 +160,31 @@ int bch_btree_iter_unlock(struct btree_iter *iter)
 
 bool btree_node_relock(struct btree_iter *iter, unsigned level)
 {
+	struct btree_iter *linked;
 	struct btree *b = iter->nodes[level];
 	enum six_lock_type type = btree_lock_want(iter, level);
 
 	if (btree_node_locked(iter, level))
 		return true;
 
-	if (!race_fault() &&
-	    is_btree_node(iter, level) &&
+	if (race_fault())
+		return false;
+
+	if (is_btree_node(iter, level) &&
 	    six_relock_type(&b->lock, iter->lock_seq[level], type)) {
 		mark_btree_node_locked(iter, level, type);
 		return true;
 	}
 
+	for_each_linked_btree_iter(iter, linked)
+		if (linked->nodes[level] == b &&
+		    btree_node_locked_type(linked, level) == type &&
+		    iter->lock_seq[level] == b->lock.state.seq) {
+			six_lock_increment(&b->lock, type);
+			mark_btree_node_locked(iter, level, type);
+			return true;
+		}
+
 	return false;
 }
 
@@ -365,6 +389,40 @@ void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
 			     iter->pos, iter->is_extents);
 }
 
+static void btree_iter_verify_locking(struct btree_iter *iter, unsigned level)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree_iter *linked;
+
+	if (!btree_want_intent(iter, level))
+		return;
+
+	/*
+	 * Can't hold _any_ read locks (including in linked iterators) when
+	 * taking intent locks, that leads to a fun deadlock involving write
+	 * locks and journal reservations
+	 *
+	 * We could conceivably drop read locks, then retake them and if
+	 * retaking fails then return -EINTR... but, let's keep things simple
+	 * for now:
+	 */
+	BUG_ON(iter->nodes_locked != iter->nodes_intent_locked);
+
+	for_each_linked_btree_iter(iter, linked)
+		BUG_ON(linked->nodes_locked != linked->nodes_intent_locked);
+
+	/* Lock ordering: */
+	for_each_linked_btree_iter(iter, linked) {
+		BUG_ON(linked->nodes_locked &&
+		       btree_iter_cmp(linked, iter) > 0);
+
+		BUG_ON(linked->nodes_locked &&
+		       linked->btree_id == iter->btree_id &&
+		       level > __fls(linked->nodes_locked));
+	}
+#endif
+}
+
 static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos)
 {
 	struct cache_set *c = iter->c;
@@ -378,6 +436,8 @@ static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos)
 
 	iter->level = b->level;
 
+	btree_iter_verify_locking(iter, iter->level);
+
 	if (btree_node_lock(b, iter, iter->level,
 			    (b != c->btree_roots[iter->btree_id].b))) {
 		btree_iter_node_set(iter, b, pos);
@@ -399,6 +459,8 @@ static inline int btree_iter_down(struct btree_iter *iter, struct bpos pos,
 	if (unlikely(IS_ERR(b)))
 		return PTR_ERR(b);
 
+	btree_iter_verify_locking(iter, iter->level - 1);
+
 	--iter->level;
 	btree_iter_node_set(iter, b, pos);
 	return 0;
@@ -409,42 +471,6 @@ static void btree_iter_up(struct btree_iter *iter)
 	btree_node_unlock(iter, iter->level++);
 }
 
-static void btree_iter_verify_locking(struct btree_iter *iter)
-{
-#ifdef CONFIG_BCACHE_DEBUG
-	struct btree_iter *linked;
-	unsigned level;
-
-	/*
-	 * Can't hold _any_ read locks (including in linked iterators) when
-	 * taking intent locks, that leads to a fun deadlock involving write
-	 * locks and journal reservations
-	 *
-	 * We could conceivably drop read locks, then retake them and if
-	 * retaking fails then return -EINTR... but, let's keep things simple
-	 * for now:
-	 */
-	for_each_linked_btree_iter(iter, linked)
-		for (level = 0; level < BTREE_MAX_DEPTH; level++)
-			BUG_ON(btree_node_read_locked(linked, level));
-
-	/* Lock ordering: */
-	for_each_linked_btree_iter(iter, linked)
-		BUG_ON(btree_iter_cmp(iter, linked) < 0 &&
-		       btree_node_intent_locked(linked, 0));
-
-	/*
-	 * Also, we have to take intent locks on interior nodes before leaf
-	 * nodes - verify that linked iterators don't have intent locks held at
-	 * depths lower than where we're at:
-	 */
-	for_each_linked_btree_iter(iter, linked)
-		BUG_ON(btree_iter_cmp(linked, iter) <= 0 &&
-		       linked->locks_want > 0 &&
-		       linked->locks_want < iter->locks_want);
-#endif
-}
-
 /*
  * This is the main state machine for walking down the btree - walks down to a
  * specified depth
@@ -483,9 +509,6 @@ retry:
 			__btree_iter_advance(iter);
 	}
 
-	if (iter->locks_want > 0)
-		btree_iter_verify_locking(iter);
-
 	/*
 	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
 	 * would indicate to other code that we got to the end of the btree,
@@ -756,6 +779,8 @@ void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
 
 	memcpy(dst->node_iters, src->node_iters,
 	       sizeof(src->node_iters));
+
+	bch_btree_iter_upgrade(dst);
 }
 
 void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src)
@@ -765,4 +790,6 @@ void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src)
 	dst->nodes_locked = 0;
 	dst->nodes_intent_locked = 0;
 	bch_btree_iter_link(src, dst);
+
+	bch_btree_iter_upgrade(dst);
 }
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 025a29d96138..f90bbe261792 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -1484,8 +1484,7 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
 	}
 
 	reserve = bch_btree_reserve_get(c, b, 0,
-					!(flags & BTREE_INSERT_NOFAIL),
-					cl);
+					!(flags & BTREE_INSERT_NOFAIL), cl);
 	if (IS_ERR(reserve)) {
 		ret = PTR_ERR(reserve);
 		goto out_get_locks;
@@ -1495,15 +1494,29 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
 
 	btree_split(b, iter, NULL, reserve, as);
 	bch_btree_reserve_put(c, reserve);
+	iter->locks_want = 1;
+
+	for_each_linked_btree_iter(iter, linked)
+		if (linked->btree_id == iter->btree_id &&
+		    btree_iter_cmp(linked, iter) <= 0)
+			linked->locks_want = 1;
 
 out:
 	up_read(&c->gc_lock);
 	return ret;
out_get_locks:
 	/* Lock ordering... */
 	for_each_linked_btree_iter(iter, linked)
-		if (btree_iter_cmp(linked, iter) <= 0)
+		if (linked->btree_id == iter->btree_id &&
+		    btree_iter_cmp(linked, iter) <= 0) {
+			unsigned i;
+
+			for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+				btree_node_unlock(linked, i);
+				linked->lock_seq[i]--;
+			}
 			linked->locks_want = U8_MAX;
+		}
 
 	goto out;
 }
@@ -1679,10 +1692,6 @@ retry:
 	trans_for_each_entry(trans, i)
 		if (!same_leaf_as_prev(trans, i))
 			bch_btree_node_write_lazy(i->iter->nodes[0], i->iter);
-
-	trans_for_each_entry(trans, i)
-		i->iter->locks_want = 1;
-
 out:
 	percpu_ref_put(&c->writes);
 	return ret;
diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c
index f106513928ca..25e70cc9e7e5 100644
--- a/drivers/md/bcache/dirent.c
+++ b/drivers/md/bcache/dirent.c
@@ -221,16 +221,6 @@ int bch_dirent_rename(struct cache_set *c,
 
 	do {
 		/*
-		 * When taking intent locks, we have to take interior node locks
-		 * before leaf node locks; if the second iter we traverse has
-		 * locks_want > the first iter, we could end up taking an intent
-		 * lock on an interior node after traversing the first iterator
-		 * only took an intent lock on a leaf.
-		 */
-		src_iter.locks_want = dst_iter.locks_want =
-			max(src_iter.locks_want, dst_iter.locks_want);
-
-		/*
 		 * Have to traverse lower btree nodes before higher - due to
 		 * lock ordering.
 		 */
diff --git a/drivers/md/bcache/str_hash.h b/drivers/md/bcache/str_hash.h
index 6bdd9f26c4e6..d296e9e92894 100644
--- a/drivers/md/bcache/str_hash.h
+++ b/drivers/md/bcache/str_hash.h
@@ -256,7 +256,7 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
 
 	bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
 		POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
-	bch_btree_iter_init_intent(&iter, c, 0, POS_MIN);
+	bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
 	bch_btree_iter_link(&hashed_slot, &iter);
 
 	do {
@@ -301,12 +301,16 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
 		insert->k.p = iter.pos;
 		ret = bch_btree_insert_at(&iter, insert, NULL, NULL,
					  journal_seq, BTREE_INSERT_ATOMIC);
-		/* On successful insert, we don't want to clobber ret with error
-		 * from iter:
-		 */
+
+		/* unlock before traversing hashed_slot: */
 		bch_btree_iter_unlock(&iter);
 	} while (ret == -EINTR);
 
+	/*
+	 * On successful insert, we don't want to clobber ret with error from
+	 * iter:
+	 */
+	bch_btree_iter_unlock(&iter);
 	bch_btree_iter_unlock(&hashed_slot);
 	return ret;
err:
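
The relock paths in this commit (six_relock_intent()/six_relock_type() guarded by iter->lock_seq[], and the PTR_HASH() comment in btree_cache.c) rest on one idea: a node's lock carries a sequence number that changes whenever the node is modified or freed, so relocking with a stale sequence number fails and the iterator has to restart its traversal. The sketch below is a minimal user-space model of that idea, not the kernel's six-lock API; the toy_* names, the single exclusive holder, and drop_then_relock() are all invented for illustration.

```c
#include <stdatomic.h>
#include <stdbool.h>

struct toy_lock {
	atomic_uint	seq;	/* bumped whenever the node is modified or freed */
	atomic_flag	held;	/* simplified: one exclusive holder; init with ATOMIC_FLAG_INIT */
};

/* Take the lock and remember the sequence number it was taken at. */
static bool toy_trylock(struct toy_lock *l, unsigned *seq_out)
{
	if (atomic_flag_test_and_set(&l->held))
		return false;
	*seq_out = atomic_load(&l->seq);
	return true;
}

/* Unlock after modifying (or freeing) the node: invalidates saved sequences. */
static void toy_unlock_modified(struct toy_lock *l)
{
	atomic_fetch_add(&l->seq, 1);
	atomic_flag_clear(&l->held);
}

static void toy_unlock(struct toy_lock *l)
{
	atomic_flag_clear(&l->held);
}

/*
 * Relocking only succeeds if nobody modified the node since seq was recorded -
 * the property btree_node_relock() depends on when it passes
 * iter->lock_seq[level] to six_relock_type().
 */
static bool toy_try_relock(struct toy_lock *l, unsigned seq)
{
	if (atomic_flag_test_and_set(&l->held))
		return false;
	if (atomic_load(&l->seq) != seq) {
		atomic_flag_clear(&l->held);
		return false;
	}
	return true;
}

/* Usage: record the seq, drop the lock (e.g. to respect lock ordering), relock. */
static bool drop_then_relock(struct toy_lock *node_lock)
{
	unsigned seq;

	if (!toy_trylock(node_lock, &seq))
		return false;
	toy_unlock(node_lock);

	/* Fails if the node changed in between; the caller restarts traversal. */
	return toy_try_relock(node_lock, seq);
}
```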
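The new btree_iter_verify_locking() asserts that linked iterators acquire their locks in one consistent order (btree_iter_cmp(), and interior nodes before leaves), which is the standard way to rule out ABBA deadlocks between tasks. The sketch below shows only that ordering idea in isolation: struct toy_iter_pos, toy_iter_cmp() and toy_lock_pair() are invented for illustration and merely mirror btree_iter_cmp() in spirit; the mutexes are assumed to be initialized with pthread_mutex_init() or PTHREAD_MUTEX_INITIALIZER.

```c
#include <pthread.h>
#include <stdint.h>

struct toy_iter_pos {
	unsigned	btree_id;
	uint64_t	pos;
	pthread_mutex_t	lock;
};

/* Total order on lock targets: first by btree, then by position. */
static int toy_iter_cmp(const struct toy_iter_pos *a,
			const struct toy_iter_pos *b)
{
	if (a->btree_id != b->btree_id)
		return a->btree_id < b->btree_id ? -1 : 1;
	if (a->pos != b->pos)
		return a->pos < b->pos ? -1 : 1;
	return 0;
}

/*
 * Always lock the "smaller" target first, regardless of argument order, so
 * two tasks locking the same pair can never each hold one lock and wait on
 * the other.
 */
static void toy_lock_pair(struct toy_iter_pos *a, struct toy_iter_pos *b)
{
	if (toy_iter_cmp(a, b) > 0) {
		struct toy_iter_pos *t = a;

		a = b;
		b = t;
	}

	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}
```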