author     Kent Overstreet <kent.overstreet@gmail.com>    2016-11-03 23:18:09 -0800
committer  Kent Overstreet <kent.overstreet@gmail.com>    2016-11-15 07:59:54 -0900
commit     d06cf260c3ca1d8f4ef93fe2c7db4bf046889943
tree       49cd7b6c945bef766f993f19f0b37b37b0b48308
parent     e1bd0fffe23afca44e657751e22c32a94689ba00
bcache: Better locking for linked iterators
Linked iterators are no longer required to be traversed in key order;
instead, they return -EINTR if the caller needs to restart, because locks
had to be dropped and reacquired to avoid deadlocking.
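
To illustrate the calling convention this enables, here is a minimal sketch (not code from this patch; update_two_dirent_slots, its arguments, and the choice of BTREE_ID_DIRENTS are hypothetical, while the iterator and insert calls mirror the ones used in dirent.c and str_hash.h below). Callers link their iterators, do their lookups in whatever order is convenient, and simply restart when anything returns -EINTR:

/*
 * Sketch only - assumes the bcache btree_iter/btree_update headers.
 * Two linked iterators are traversed out of key order; on -EINTR
 * (locks were dropped to avoid deadlock) we retry from the top.
 */
static int update_two_dirent_slots(struct cache_set *c,
				   struct bpos pos_a, struct bpos pos_b,
				   struct bkey_i *new_key, u64 *journal_seq)
{
	struct btree_iter iter_a, iter_b;
	struct bkey_s_c k;
	int ret;

	bch_btree_iter_init_intent(&iter_a, c, BTREE_ID_DIRENTS, pos_a);
	bch_btree_iter_init_intent(&iter_b, c, BTREE_ID_DIRENTS, pos_b);
	bch_btree_iter_link(&iter_a, &iter_b);
retry:
	k = bch_btree_iter_peek_with_holes(&iter_a);
	if ((ret = btree_iter_err(k)))
		goto err;

	k = bch_btree_iter_peek_with_holes(&iter_b);
	if ((ret = btree_iter_err(k)))
		goto err;

	new_key->k.p = iter_b.pos;
	ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
				  BTREE_INSERT_ATOMIC,
				  BTREE_INSERT_ENTRY(&iter_b, new_key));
err:
	if (ret == -EINTR)
		goto retry;	/* locks were dropped; redo the lookups */

	bch_btree_iter_unlock(&iter_b);
	bch_btree_iter_unlock(&iter_a);
	return ret;
}

Previously the caller itself had to traverse linked iterators lowest key first (see the removed do-loop and "lock ordering" comments in bch_dirent_rename below); that ordering requirement is what this patch lifts.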
 drivers/md/bcache/btree_cache.c    |  85
 drivers/md/bcache/btree_cache.h    |   7
 drivers/md/bcache/btree_io.c       |  12
 drivers/md/bcache/btree_iter.c     | 476
 drivers/md/bcache/btree_iter.h     |  53
 drivers/md/bcache/btree_locking.h  |  59
 drivers/md/bcache/btree_types.h    |   6
 drivers/md/bcache/btree_update.c   | 108
 drivers/md/bcache/debug.c          |   6
 drivers/md/bcache/dirent.c         | 225
 drivers/md/bcache/fs-gc.c          |   4
 drivers/md/bcache/fs-io.c          |  45
 drivers/md/bcache/fs.c             |   8
 drivers/md/bcache/inode.c          |  14
 drivers/md/bcache/migrate.c        |  23
 drivers/md/bcache/movinggc.c       |   3
 drivers/md/bcache/request.c        |   2
 drivers/md/bcache/str_hash.h       | 192
 drivers/md/bcache/tier.c           |   3
 include/trace/events/bcache.h      |  60
 include/uapi/linux/bcache.h        |   2
 21 files changed, 740 insertions(+), 653 deletions(-)
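
Read-side loops converge on a similar convention in this patch: bch_btree_iter_peek() can now return an error key, which callers check with btree_iter_err() rather than treating a NULL key as the only terminator. A minimal sketch (count_extents and nr are hypothetical names; the iterator calls match the ones used in migrate.c and fs-gc.c below):

/* Sketch only - walk all extents, stopping on an iterator error. */
static int count_extents(struct cache_set *c, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

	while ((k = bch_btree_iter_peek(&iter)).k &&
	       !(ret = btree_iter_err(k))) {
		(*nr)++;
		bch_btree_iter_advance_pos(&iter);
		bch_btree_iter_cond_resched(&iter);
	}

	bch_btree_iter_unlock(&iter);
	return ret;	/* 0, or whatever error the iterator reported */
}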
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c index 22dae330acdd..a997327eb37d 100644 --- a/drivers/md/bcache/btree_cache.c +++ b/drivers/md/bcache/btree_cache.c @@ -448,8 +448,8 @@ int mca_cannibalize_lock(struct cache_set *c, struct closure *cl) goto success; if (!cl) { - trace_bcache_mca_cannibalize_lock_fail(c, cl); - return -EINTR; + trace_bcache_mca_cannibalize_lock_fail(c); + return -ENOMEM; } closure_wait(&c->mca_wait, cl); @@ -462,11 +462,11 @@ int mca_cannibalize_lock(struct cache_set *c, struct closure *cl) goto success; } - trace_bcache_mca_cannibalize_lock_fail(c, cl); + trace_bcache_mca_cannibalize_lock_fail(c); return -EAGAIN; success: - trace_bcache_mca_cannibalize_lock(c, cl); + trace_bcache_mca_cannibalize_lock(c); return 0; } @@ -492,10 +492,9 @@ static struct btree *mca_cannibalize(struct cache_set *c) } } -struct btree *mca_alloc(struct cache_set *c, struct closure *cl) +struct btree *mca_alloc(struct cache_set *c) { struct btree *b; - int ret; u64 start_time = local_clock(); mutex_lock(&c->btree_cache_lock); @@ -546,37 +545,34 @@ out: return b; err: /* Try to cannibalize another cached btree node: */ - ret = mca_cannibalize_lock(c, cl); - if (ret) { + if (c->btree_cache_alloc_lock == current) { + b = mca_cannibalize(c); + list_del_init(&b->list); mutex_unlock(&c->btree_cache_lock); - return ERR_PTR(ret); - } - b = mca_cannibalize(c); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); + mca_hash_remove(c, b); - mca_hash_remove(c, b); + trace_bcache_mca_cannibalize(c); + goto out; + } - trace_bcache_mca_cannibalize(c, cl); - goto out; + mutex_unlock(&c->btree_cache_lock); + return ERR_PTR(-ENOMEM); } /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter, const struct bkey_i *k, unsigned level, - struct closure *cl) + enum six_lock_type lock_type) { struct cache_set *c = iter->c; struct btree *b; - b = mca_alloc(c, cl); + b = mca_alloc(c); if (IS_ERR(b)) return b; - mca_cannibalize_unlock(c); - bkey_copy(&b->key, k); if (mca_hash_insert(c, b, level, iter->btree_id)) { /* raced with another fill: */ @@ -608,9 +604,7 @@ static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter, bch_btree_node_read(c, b); six_unlock_write(&b->lock); - mark_btree_node_locked(iter, level, btree_lock_want(iter, level)); - - if (btree_lock_want(iter, level) == SIX_LOCK_read) + if (lock_type == SIX_LOCK_read) BUG_ON(!six_trylock_convert(&b->lock, SIX_LOCK_intent, SIX_LOCK_read)); @@ -629,7 +623,7 @@ static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter, */ struct btree *bch_btree_node_get(struct btree_iter *iter, const struct bkey_i *k, unsigned level, - struct closure *cl) + enum six_lock_type lock_type) { int i = 0; struct btree *b; @@ -646,16 +640,14 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch_btree_node_fill(iter, k, level, cl); + b = bch_btree_node_fill(iter, k, level, lock_type); /* We raced and found the btree node in the cache */ if (!b) goto retry; - if (IS_ERR(b)) { - BUG_ON(PTR_ERR(b) != -EAGAIN); + if (IS_ERR(b)) return b; - } } else { /* * There's a potential deadlock with splits and insertions into @@ -678,8 +670,8 @@ retry: * free it: * * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which will - * cause our btree_node_lock() call below to fail. 
+ * when they're freed - and PTR_HASH() is zeroed out, which we + * check for after we lock the node. * * Then, btree_node_relock() on the parent will fail - because * the parent was modified, when the pointer to the node we want @@ -688,34 +680,35 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, iter, level, - PTR_HASH(&b->key) != PTR_HASH(k))) { - if (!btree_node_relock(iter, level + 1)) { - trace_bcache_btree_intent_lock_fail(b, iter); - return ERR_PTR(-EINTR); - } + if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + return ERR_PTR(-EINTR); - goto retry; - } + if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || + b->level != level || + race_fault())) { + six_unlock_type(&b->lock, lock_type); + if (btree_node_relock(iter, level + 1)) + goto retry; - BUG_ON(b->level != level); + return ERR_PTR(-EINTR); + } } - /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) - set_btree_node_accessed(b); - for (; i <= b->keys.nsets; i++) { prefetch(b->keys.set[i].tree); prefetch(b->keys.set[i].data); } - if (btree_node_read_error(b)) { - __btree_node_unlock(iter, level, b); + /* avoid atomic set bit if it's not needed: */ + if (btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->lock, lock_type); return ERR_PTR(-EIO); } - BUG_ON(!b->written); + EBUG_ON(!b->written); return b; } diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h index e3950bf4cfb3..5beec6e4e942 100644 --- a/drivers/md/bcache/btree_cache.h +++ b/drivers/md/bcache/btree_cache.h @@ -17,11 +17,10 @@ int mca_hash_insert(struct cache_set *, struct btree *, void mca_cannibalize_unlock(struct cache_set *); int mca_cannibalize_lock(struct cache_set *, struct closure *); -struct btree *mca_alloc(struct cache_set *, struct closure *); +struct btree *mca_alloc(struct cache_set *); -struct btree *bch_btree_node_get(struct btree_iter *, - const struct bkey_i *, unsigned, - struct closure *); +struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, + unsigned, enum six_lock_type); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c index c343811d9bc9..1033232307f4 100644 --- a/drivers/md/bcache/btree_io.c +++ b/drivers/md/bcache/btree_io.c @@ -455,13 +455,19 @@ int bch_btree_root_read(struct cache_set *c, enum btree_id id, { struct closure cl; struct btree *b; + int ret; closure_init_stack(&cl); - while (IS_ERR(b = mca_alloc(c, &cl))) { - BUG_ON(PTR_ERR(b) != -EAGAIN); + do { + ret = mca_cannibalize_lock(c, &cl); closure_sync(&cl); - } + } while (ret); + + b = mca_alloc(c); + mca_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); bkey_copy(&b->key, k); BUG_ON(mca_hash_insert(c, b, level, id)); diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c index f0e729b7f6eb..d6ab2794254b 100644 --- a/drivers/md/bcache/btree_iter.c +++ b/drivers/md/bcache/btree_iter.c @@ -124,29 +124,164 @@ success: return true; } +/* Slowpath: */ +bool __bch_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type) +{ + struct btree_iter *linked; + + /* Can't have children locked before ancestors: */ + EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); + + /* + * Can't hold any read locks while we block taking an intent lock - see + * below for reasoning, 
and we should have already dropped any read + * locks in the current iterator + */ + EBUG_ON(type == SIX_LOCK_intent && + iter->nodes_locked != iter->nodes_intent_locked); + + for_each_linked_btree_iter(iter, linked) + if (linked->nodes[level] == b && + btree_node_locked_type(linked, level) == type) { + six_lock_increment(&b->lock, type); + return true; + } + + /* + * Must lock btree nodes in key order - this case hapens when locking + * the prev sibling in btree node merging: + */ + if (iter->nodes_locked && + __ffs(iter->nodes_locked) == level && + __btree_iter_cmp(iter->btree_id, pos, iter)) + return false; + + for_each_linked_btree_iter(iter, linked) { + if (!linked->nodes_locked) + continue; + + /* + * Can't block taking an intent lock if we have _any_ nodes read + * locked: + * + * - Our read lock blocks another thread with an intent lock on + * the same node from getting a write lock, and thus from + * dropping its intent lock + * + * - And the other thread may have multiple nodes intent locked: + * both the node we want to intent lock, and the node we + * already have read locked - deadlock: + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { + linked->locks_want = max(linked->locks_want, + iter->locks_want); + return false; + } + + /* We have to lock btree nodes in key order: */ + if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + return false; + + /* + * Interior nodes must be locked before their descendants: if + * another iterator has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (linked->btree_id == iter->btree_id && + level > __fls(linked->nodes_locked)) { + linked->locks_want = max(linked->locks_want, + iter->locks_want); + return false; + } + } + + six_lock_type(&b->lock, type); + return true; +} + /* Btree iterator locking: */ -bool bch_btree_iter_upgrade(struct btree_iter *iter) + +static void btree_iter_drop_extra_locks(struct btree_iter *iter) { - int i; + unsigned l; + + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) > iter->locks_want) { + if (!btree_node_locked(iter, l)) + panic("l %u nodes_locked %u\n", l, iter->nodes_locked); + + if (l > iter->level) { + btree_node_unlock(iter, l); + } else if (btree_node_intent_locked(iter, l)) { + BUG_ON(!six_trylock_convert(&iter->nodes[l]->lock, + SIX_LOCK_intent, + SIX_LOCK_read)); + iter->nodes_intent_locked ^= 1 << l; + } + } +} - for (i = iter->level; i < iter->locks_want && iter->nodes[i]; i++) - if (!btree_node_relock(iter, i)) - return false; +bool __bch_btree_iter_set_locks_want(struct btree_iter *iter, + unsigned new_locks_want) +{ + struct btree_iter *linked; + unsigned l; + + /* Drop locks we don't want anymore: */ + if (new_locks_want < iter->locks_want) + for_each_linked_btree_iter(iter, linked) + if (linked->locks_want > new_locks_want) { + linked->locks_want = max_t(unsigned, 1, + new_locks_want); + btree_iter_drop_extra_locks(linked); + } + + iter->locks_want = new_locks_want; + btree_iter_drop_extra_locks(iter); + + for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++) + if (!btree_node_relock(iter, l)) + goto fail; return true; +fail: + /* + * Just an optimization: ancestor nodes must be locked before child + * nodes, so set locks_want on iterators that might lock ancestors + * before us to avoid getting -EINTR later: + */ + for_each_linked_btree_iter(iter, linked) + if (linked->btree_id == iter->btree_id && + btree_iter_cmp(linked, iter) <= 0) + linked->locks_want = 
max_t(unsigned, linked->locks_want, + new_locks_want); + return false; } -int bch_btree_iter_unlock(struct btree_iter *iter) +static int __bch_btree_iter_unlock(struct btree_iter *iter) { - unsigned l = 0; + BUG_ON(iter->error == -EINTR); while (iter->nodes_locked) - btree_node_unlock(iter, l++); + btree_node_unlock(iter, __ffs(iter->nodes_locked)); return iter->error; } +int bch_btree_iter_unlock(struct btree_iter *iter) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + __bch_btree_iter_unlock(linked); + return __bch_btree_iter_unlock(iter); +} + /* Btree iterator: */ #ifdef CONFIG_BCACHE_DEBUG @@ -283,6 +418,8 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter) &iter->nodes[iter->level]->keys); struct bkey_s_c ret; + EBUG_ON(!btree_node_locked(iter, iter->level)); + if (!k) return bkey_s_c_null; @@ -302,6 +439,8 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter) &iter->nodes[iter->level]->keys); struct bkey_s_c ret; + EBUG_ON(!btree_node_locked(iter, iter->level)); + if (!k) return bkey_s_c_null; @@ -320,15 +459,14 @@ static inline void __btree_iter_advance(struct btree_iter *iter) } static inline void btree_iter_node_set(struct btree_iter *iter, - struct btree *b, - struct bpos pos) + struct btree *b) { BUG_ON(b->lock.state.seq & 1); iter->lock_seq[b->level] = b->lock.state.seq; iter->nodes[b->level] = b; bch_btree_node_iter_init(&iter->node_iters[b->level], &b->keys, - pos, iter->is_extents); + iter->pos, iter->is_extents); } static bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b) @@ -371,7 +509,7 @@ bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) mark_btree_node_intent_locked(linked, b->level); } - btree_iter_node_set(linked, b, linked->pos); + btree_iter_node_set(linked, b); } if (!btree_iter_pos_in_node(iter, b)) { @@ -380,7 +518,7 @@ bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) } mark_btree_node_intent_locked(iter, b->level); - btree_iter_node_set(iter, b, iter->pos); + btree_iter_node_set(iter, b); return true; } @@ -425,86 +563,71 @@ void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) iter->pos, iter->is_extents); } -static void btree_iter_verify_locking(struct btree_iter *iter, unsigned level) -{ -#ifdef CONFIG_BCACHE_DEBUG - struct btree_iter *linked; - - if (!btree_want_intent(iter, level)) - return; - - /* - * Can't hold _any_ read locks (including in linked iterators) when - * taking intent locks, that leads to a fun deadlock involving write - * locks and journal reservations - * - * We could conceivably drop read locks, then retake them and if - * retaking fails then return -EINTR... 
but, let's keep things simple - * for now: - */ - BUG_ON(iter->nodes_locked != iter->nodes_intent_locked); - - for_each_linked_btree_iter(iter, linked) - BUG_ON(linked->nodes_locked != linked->nodes_intent_locked); - - /* Lock ordering: */ - for_each_linked_btree_iter(iter, linked) { - BUG_ON(linked->nodes_locked && - btree_iter_cmp(linked, iter) > 0); - - BUG_ON(linked->nodes_locked && - linked->btree_id == iter->btree_id && - level > __fls(linked->nodes_locked)); - } -#endif -} - -static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos, - unsigned l) +static inline int btree_iter_lock_root(struct btree_iter *iter, + unsigned depth_want) { struct cache_set *c = iter->c; + struct btree *b; + enum six_lock_type lock_type; + unsigned i; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - memset(iter->nodes, 0, sizeof(iter->nodes)); + EBUG_ON(iter->nodes_locked); while (1) { - struct btree *b = READ_ONCE(c->btree_roots[iter->btree_id].b); - + b = READ_ONCE(c->btree_roots[iter->btree_id].b); iter->level = READ_ONCE(b->level); - if (iter->level < l) { - iter->level = l; - break; + if (iter->level < depth_want) { + /* + * the root is at a lower depth than the depth we want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ + iter->level = depth_want; + iter->nodes[iter->level] = NULL; + return 0; } - btree_iter_verify_locking(iter, iter->level); + lock_type = btree_lock_want(iter, iter->level); + if (!btree_node_lock(b, POS_MAX, iter->level, + iter, lock_type)) + return -EINTR; + + if (b == c->btree_roots[iter->btree_id].b && + b->level == iter->level && + !race_fault()) { + for (i = 0; i < iter->level; i++) + iter->nodes[i] = BTREE_ITER_NOT_END; + iter->nodes[iter->level] = b; + + mark_btree_node_locked(iter, iter->level, lock_type); + btree_iter_node_set(iter, b); + return 0; - if (btree_node_lock(b, iter, iter->level, - (b != c->btree_roots[iter->btree_id].b))) { - btree_iter_node_set(iter, b, pos); - break; } + + six_unlock_type(&b->lock, lock_type); } } -static inline int btree_iter_down(struct btree_iter *iter, struct bpos pos, - struct closure *cl) +static inline int btree_iter_down(struct btree_iter *iter) { struct btree *b; struct bkey_s_c k = __btree_iter_peek(iter); + unsigned level = iter->level - 1; + enum six_lock_type lock_type = btree_lock_want(iter, level); BKEY_PADDED(k) tmp; bkey_reassemble(&tmp.k, k); - b = bch_btree_node_get(iter, &tmp.k, iter->level - 1, cl); + b = bch_btree_node_get(iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); - btree_iter_verify_locking(iter, iter->level - 1); - - --iter->level; - btree_iter_node_set(iter, b, pos); + iter->level = level; + mark_btree_node_locked(iter, level, lock_type); + btree_iter_node_set(iter, b); return 0; } @@ -513,6 +636,84 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } +int __must_check __bch_btree_iter_traverse(struct btree_iter *); + +static int btree_iter_traverse_error(struct btree_iter *iter, int ret) +{ + struct cache_set *c = iter->c; + struct btree_iter *linked, *sorted_iters, **i; +retry_all: + for_each_linked_btree_iter(iter, linked) + bch_btree_iter_unlock(linked); + bch_btree_iter_unlock(iter); + + if (ret != -ENOMEM && ret != -EINTR) + goto io_error; + + if (ret == -ENOMEM) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = mca_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + } + + /* + * 
Linked iters are normally a circular singly linked list - break cycle + * while we sort them: + */ + linked = iter->next; + iter->next = NULL; + sorted_iters = NULL; + + while (linked) { + iter = linked; + linked = linked->next; + + i = &sorted_iters; + while (*i && btree_iter_cmp(iter, *i) > 0) + i = &(*i)->next; + + iter->next = *i; + *i = iter; + } + + /* Make list circular again: */ + iter = sorted_iters; + while (iter->next) + iter = iter->next; + iter->next = sorted_iters; + + /* Now, redo traversals in correct order: */ + + iter = sorted_iters; + do { +retry: + ret = __bch_btree_iter_traverse(iter); + if (unlikely(ret)) { + if (ret == -EINTR) + goto retry; + goto retry_all; + } + + iter = iter->next; + } while (iter != sorted_iters); + + ret = btree_iter_linked(iter) ? -EINTR : 0; +out: + mca_cannibalize_unlock(c); + return ret; +io_error: + BUG_ON(ret != -EIO); + + iter->error = ret; + iter->nodes[iter->level] = NULL; + goto out; +} + /* * This is the main state machine for walking down the btree - walks down to a * specified depth @@ -522,14 +723,10 @@ static void btree_iter_up(struct btree_iter *iter) * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch_btree_iter_unlock(). */ -static int __must_check __bch_btree_iter_traverse(struct btree_iter *iter, - unsigned l, struct bpos pos) +int __must_check __bch_btree_iter_traverse(struct btree_iter *iter) { - if (!iter->nodes[iter->level]) - return 0; + unsigned depth_want = iter->level; - iter->at_end_of_leaf = false; -retry: /* make sure we have all the intent locks we need - ugh */ if (unlikely(iter->nodes[iter->level] && iter->level + 1 < iter->locks_want)) { @@ -566,7 +763,7 @@ retry: struct bkey_s_c k; while ((k = __btree_iter_peek_all(iter)).k && - !btree_iter_pos_cmp(pos, k.k, iter->is_extents)) + !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents)) __btree_iter_advance(iter); } @@ -576,35 +773,33 @@ retry: * here it indicates that relocking the root failed - it's critical that * btree_iter_lock_root() comes next and that it can't fail */ - while (iter->level > l) - if (iter->nodes[iter->level]) { - struct closure cl; - int ret; - - closure_init_stack(&cl); - - ret = btree_iter_down(iter, pos, &cl); - if (unlikely(ret)) { - bch_btree_iter_unlock(iter); - closure_sync(&cl); - - if (ret == -EAGAIN || ret == -EINTR) - goto retry; - - iter->error = ret; - iter->level = BTREE_MAX_DEPTH; - return ret; - } - } else { - btree_iter_lock_root(iter, pos, l); + while (iter->level > depth_want) { + int ret = iter->nodes[iter->level] + ? 
btree_iter_down(iter) + : btree_iter_lock_root(iter, depth_want); + if (unlikely(ret)) { + iter->level = depth_want; + return ret; } + } return 0; } int __must_check bch_btree_iter_traverse(struct btree_iter *iter) { - return __bch_btree_iter_traverse(iter, iter->level, iter->pos); + int ret; + + if (unlikely(!iter->nodes[iter->level])) + return 0; + + iter->at_end_of_leaf = false; + + ret = __bch_btree_iter_traverse(iter); + if (unlikely(ret)) + ret = btree_iter_traverse_error(iter, ret); + + return ret; } /* Iterate across nodes (leaf and interior nodes) */ @@ -648,11 +843,15 @@ struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth) return NULL; b = iter->nodes[iter->level]; + if (!b) + return b; if (bkey_cmp(iter->pos, b->key.k.p) < 0) { - struct bpos pos = bkey_successor(iter->pos); + /* Haven't gotten to the end of the parent node: */ + iter->pos = bkey_successor(iter->pos); + iter->level = depth; - ret = __bch_btree_iter_traverse(iter, depth, pos); + ret = bch_btree_iter_traverse(iter); if (ret) return NULL; @@ -672,6 +871,7 @@ void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_p struct btree_node_iter *node_iter = &iter->node_iters[0]; struct bkey_packed *k; + EBUG_ON(iter->level != 0); EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); EBUG_ON(!btree_node_locked(iter, 0)); EBUG_ON(bkey_cmp(new_pos, iter->nodes[0]->key.k.p) > 0); @@ -691,7 +891,7 @@ void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_p void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); + EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ iter->pos = new_pos; } @@ -722,13 +922,14 @@ void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos) struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter) { struct bkey_s_c k; - struct bpos pos = iter->pos; int ret; while (1) { - ret = __bch_btree_iter_traverse(iter, 0, pos); - if (unlikely(ret)) - break; + ret = bch_btree_iter_traverse(iter); + if (unlikely(ret)) { + iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + return bkey_s_c_err(ret); + } k = __btree_iter_peek(iter); if (likely(k.k)) { @@ -742,16 +943,16 @@ struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter) return k; } - pos = iter->nodes[0]->key.k.p; + iter->pos = iter->nodes[0]->key.k.p; - if (!bkey_cmp(pos, POS_MAX)) - break; + if (!bkey_cmp(iter->pos, POS_MAX)) { + iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + bch_btree_iter_unlock(iter); + return bkey_s_c_null; + } - pos = btree_type_successor(iter->btree_id, pos); + iter->pos = btree_type_successor(iter->btree_id, iter->pos); } - - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - return bkey_s_c_null; } struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter) @@ -761,10 +962,10 @@ struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter) int ret; while (1) { - ret = __bch_btree_iter_traverse(iter, 0, iter->pos); + ret = bch_btree_iter_traverse(iter); if (unlikely(ret)) { iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - return bkey_s_c_null; + return bkey_s_c_err(ret); } k = __btree_iter_peek_all(iter); @@ -805,70 +1006,43 @@ recheck: void __bch_btree_iter_init(struct btree_iter *iter, struct cache_set *c, enum btree_id btree_id, struct bpos pos, - int locks_want, unsigned depth) + unsigned locks_want, unsigned depth) { iter->level = depth; iter->is_extents = btree_id == BTREE_ID_EXTENTS; iter->nodes_locked = 0; 
iter->nodes_intent_locked = 0; - iter->locks_want = locks_want; + iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); iter->btree_id = btree_id; iter->at_end_of_leaf = 0; iter->error = 0; iter->c = c; iter->pos = pos; + memset(iter->nodes, 0, sizeof(iter->nodes)); iter->nodes[iter->level] = BTREE_ITER_NOT_END; - iter->nodes[iter->level + 1] = NULL; iter->next = iter; } -int bch_btree_iter_unlink(struct btree_iter *iter) +void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) { - struct btree_iter *linked; - int ret = bch_btree_iter_unlock(iter); + BUG_ON(btree_iter_linked(new)); - for_each_linked_btree_iter(iter, linked) - if (linked->next == iter) { - linked->next = iter->next; - iter->next = iter; - return ret; - } - - BUG(); -} - -void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *linked) -{ - BUG_ON(linked->next != linked); - - linked->next = iter->next; - iter->next = linked; + new->next = iter->next; + iter->next = new; if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { unsigned nr_iters = 1; - for_each_linked_btree_iter(iter, linked) + for_each_linked_btree_iter(iter, new) nr_iters++; BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); } } -static void __bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) -{ - memcpy(dst, src, offsetof(struct btree_iter, next)); - dst->nodes_locked = dst->nodes_intent_locked = 0; -} - void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) { bch_btree_iter_unlock(dst); - __bch_btree_iter_copy(dst, src); -} - -void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src) -{ - __bch_btree_iter_copy(dst, src); - dst->next = dst; - bch_btree_iter_link(src, dst); + memcpy(dst, src, offsetof(struct btree_iter, next)); + dst->nodes_locked = dst->nodes_intent_locked = 0; } diff --git a/drivers/md/bcache/btree_iter.h b/drivers/md/bcache/btree_iter.h index 694be63f9f1c..98353341d286 100644 --- a/drivers/md/bcache/btree_iter.h +++ b/drivers/md/bcache/btree_iter.h @@ -66,9 +66,16 @@ struct btree_iter { * locked, or read and write locked, at the same time), and insertions * through one iterator won't invalidate the other linked iterators. 
*/ + + /* Must come last: */ struct btree_iter *next; }; +static inline bool btree_iter_linked(const struct btree_iter *iter) +{ + return iter->next != iter; +} + /** * for_each_linked_btree_iter - iterate over all iterators linked with @_iter */ @@ -125,8 +132,20 @@ void bch_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bset_tree *, struct bkey_packed *, unsigned, unsigned); -bool bch_btree_iter_upgrade(struct btree_iter *); int bch_btree_iter_unlock(struct btree_iter *); +bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned); + +static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter, + unsigned new_locks_want) +{ + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (iter->locks_want == new_locks_want && + iter->nodes_intent_locked == (1 << new_locks_want) - 1) + return true; + + return __bch_btree_iter_set_locks_want(iter, new_locks_want); +} bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *); void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *); @@ -147,7 +166,7 @@ void bch_btree_iter_advance_pos(struct btree_iter *); void bch_btree_iter_rewind(struct btree_iter *, struct bpos); void __bch_btree_iter_init(struct btree_iter *, struct cache_set *, - enum btree_id, struct bpos, int, unsigned); + enum btree_id, struct bpos, unsigned , unsigned); static inline void bch_btree_iter_init(struct btree_iter *iter, struct cache_set *c, @@ -165,10 +184,8 @@ static inline void bch_btree_iter_init_intent(struct btree_iter *iter, __bch_btree_iter_init(iter, c, btree_id, pos, 1, 0); } -int bch_btree_iter_unlink(struct btree_iter *); void bch_btree_iter_link(struct btree_iter *, struct btree_iter *); void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *); -void bch_btree_iter_init_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos btree_type_successor(enum btree_id id, struct bpos pos) @@ -183,12 +200,19 @@ static inline struct bpos btree_type_successor(enum btree_id id, return pos; } +static inline int __btree_iter_cmp(enum btree_id id, + struct bpos pos, + const struct btree_iter *r) +{ + if (id != r->btree_id) + return id < r->btree_id ? -1 : 1; + return bkey_cmp(pos, r->pos); +} + static inline int btree_iter_cmp(const struct btree_iter *l, const struct btree_iter *r) { - if (l->btree_id != r->btree_id) - return l->btree_id < r->btree_id ? 
-1 : 1; - return bkey_cmp(l->pos, r->pos); + return __btree_iter_cmp(l->btree_id, l->pos, r); } #define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \ @@ -207,7 +231,7 @@ static inline int btree_iter_cmp(const struct btree_iter *l, _k, _locks_want) \ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \ _start, _locks_want, 0); \ - ((_k) = bch_btree_iter_peek(_iter)).k; \ + !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \ bch_btree_iter_advance_pos(_iter)) #define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \ @@ -220,7 +244,7 @@ static inline int btree_iter_cmp(const struct btree_iter *l, _start, _k, _locks_want) \ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \ _start, _locks_want, 0); \ - ((_k) = bch_btree_iter_peek_with_holes(_iter)).k; \ + !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\ bch_btree_iter_advance_pos(_iter)) #define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \ @@ -230,16 +254,27 @@ static inline int btree_iter_cmp(const struct btree_iter *l, _start, _k) \ __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1) +static inline int btree_iter_err(struct bkey_s_c k) +{ + return IS_ERR(k.k) ? PTR_ERR(k.k) : 0; +} + /* * Unlocks before scheduling * Note: does not revalidate iterator */ static inline void bch_btree_iter_cond_resched(struct btree_iter *iter) { + struct btree_iter *linked; + if (need_resched()) { + for_each_linked_btree_iter(iter, linked) + bch_btree_iter_unlock(linked); bch_btree_iter_unlock(iter); schedule(); } else if (race_fault()) { + for_each_linked_btree_iter(iter, linked) + bch_btree_iter_unlock(linked); bch_btree_iter_unlock(iter); } } diff --git a/drivers/md/bcache/btree_locking.h b/drivers/md/bcache/btree_locking.h index fb6ce606eea4..ad0dc3a69b48 100644 --- a/drivers/md/bcache/btree_locking.h +++ b/drivers/md/bcache/btree_locking.h @@ -87,62 +87,27 @@ static inline bool btree_want_intent(struct btree_iter *iter, int level) return btree_lock_want(iter, level) == SIX_LOCK_intent; } -static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level, - struct btree *b) +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { - switch (btree_node_locked_type(iter, level)) { - case BTREE_NODE_READ_LOCKED: - six_unlock_read(&b->lock); - break; - case BTREE_NODE_INTENT_LOCKED: - six_unlock_intent(&b->lock); - break; - } + int lock_type = btree_node_locked_type(iter, level); + if (lock_type != BTREE_NODE_UNLOCKED) + six_unlock_type(&iter->nodes[level]->lock, lock_type); mark_btree_node_unlocked(iter, level); } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -{ - __btree_node_unlock(iter, level, iter->nodes[level]); -} +bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned, + struct btree_iter *, enum six_lock_type); -static inline void btree_node_lock_type(struct btree *b, struct btree_iter *iter, - enum six_lock_type type) +static inline bool btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type) { - struct btree_iter *linked; - - if (six_trylock_type(&b->lock, type)) - return; - - for_each_linked_btree_iter(iter, linked) - if (linked->nodes[b->level] == b && - btree_node_locked_type(linked, b->level) == type) { - six_lock_increment(&b->lock, type); - return; - } - - six_lock_type(&b->lock, type); + return six_trylock_type(&b->lock, type) || + __bch_btree_node_lock(b, pos, level, iter, type); } -#define 
__btree_node_lock(b, _iter, _level, check_if_raced) \ -({ \ - enum six_lock_type _type = btree_lock_want(_iter, _level); \ - bool _raced; \ - \ - btree_node_lock_type(b, _iter, _type); \ - if ((_raced = ((check_if_raced) || ((b)->level != _level)))) \ - six_unlock_type(&(b)->lock, _type); \ - else \ - mark_btree_node_locked(_iter, _level, _type); \ - \ - !_raced; \ -}) - -#define btree_node_lock(b, iter, level, check_if_raced) \ - (!race_fault() && \ - __btree_node_lock(b, iter, level, check_if_raced)) - bool btree_node_relock(struct btree_iter *, unsigned); void btree_node_unlock_write(struct btree *, struct btree_iter *); diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h index 9b6cac235ae6..d4bfd711ac65 100644 --- a/drivers/md/bcache/btree_types.h +++ b/drivers/md/bcache/btree_types.h @@ -46,6 +46,12 @@ struct btree { u16 written; u8 level; u8 btree_id; + /* + * XXX: add a delete sequence number, so when btree_node_relock() fails + * because the lock sequence number has changed - i.e. the contents were + * modified - we can still relock the node if it's still the one we + * want, without redoing the traversal + */ /* * For asynchronous splits/interior node updates: diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c index 7aef5cd9f9d9..abb94a4fcf79 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -251,7 +251,7 @@ retry: goto retry; } mem_alloc: - b = mca_alloc(c, NULL); + b = mca_alloc(c); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); @@ -1439,7 +1439,6 @@ void bch_btree_insert_node(struct btree *b, static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags, struct closure *cl) { - struct btree_iter *linked; struct cache_set *c = iter->c; struct btree *b = iter->nodes[0]; struct btree_reserve *reserve; @@ -1456,18 +1455,16 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - iter->locks_want = U8_MAX; - - if (!bch_btree_iter_upgrade(iter)) { + if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) { ret = -EINTR; - goto out_get_locks; + goto out; } reserve = bch_btree_reserve_get(c, b, 0, !(flags & BTREE_INSERT_NOFAIL), cl); if (IS_ERR(reserve)) { ret = PTR_ERR(reserve); - goto out_get_locks; + goto out; } as = bch_btree_interior_update_alloc(c); @@ -1475,29 +1472,10 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags, btree_split(b, iter, NULL, reserve, as); bch_btree_reserve_put(c, reserve); - iter->locks_want = 1; - - for_each_linked_btree_iter(iter, linked) - if (linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0) - linked->locks_want = 1; + bch_btree_iter_set_locks_want(iter, 1); out: up_read(&c->gc_lock); return ret; -out_get_locks: - /* Lock ordering... 
*/ - for_each_linked_btree_iter(iter, linked) - if (linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0) { - unsigned i; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) { - btree_node_unlock(linked, i); - linked->lock_seq[i]--; - } - linked->locks_want = U8_MAX; - } - goto out; } /** @@ -1606,14 +1584,11 @@ int __bch_btree_insert_at(struct btree_insert *trans, u64 *journal_seq) if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; - - trans_for_each_entry(trans, i) { - i->iter->locks_want = max_t(int, i->iter->locks_want, 1); - if (unlikely(!bch_btree_iter_upgrade(i->iter))) { - ret = -EINTR; +retry_locks: + ret = -EINTR; + trans_for_each_entry(trans, i) + if (!bch_btree_iter_set_locks_want(i->iter, 1)) goto err; - } - } retry: trans->did_work = false; u64s = 0; @@ -1694,6 +1669,11 @@ unlock: if (!same_leaf_as_prev(trans, i)) bch_btree_node_write_lazy(i->iter->nodes[0], i->iter); out: + /* make sure we didn't lose an error: */ + if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG)) + trans_for_each_entry(trans, i) + BUG_ON(!i->done); + percpu_ref_put(&c->writes); return ret; split: @@ -1705,21 +1685,15 @@ split: ret = bch_btree_split_leaf(split, trans->flags, &cl); if (ret) goto err; - /* * if the split didn't have to drop locks the insert will still be * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() * and is overwriting won't have changed) */ - goto retry; + goto retry_locks; err: if (ret == -EAGAIN) { - struct btree_iter *linked; - - for_each_linked_btree_iter(split, linked) - bch_btree_iter_unlock(linked); bch_btree_iter_unlock(split); - closure_sync(&cl); ret = -EINTR; } @@ -1729,28 +1703,23 @@ err: up_read(&c->gc_lock); } - /* - * Main rule is, BTREE_INSERT_ATOMIC means we can't call - * bch_btree_iter_traverse(), because if we have to we either dropped - * locks or we need a different btree node (different than the one the - * caller was looking at). - * - * BTREE_INSERT_ATOMIC doesn't mean anything w.r.t. 
journal - * reservations: - */ - if (ret == -EINTR && !(trans->flags & BTREE_INSERT_ATOMIC)) { + if (ret == -EINTR) { trans_for_each_entry(trans, i) { - ret = bch_btree_iter_traverse(i->iter); - if (ret) + int ret2 = bch_btree_iter_traverse(i->iter); + if (ret2) { + ret = ret2; goto out; + } } - ret = 0; + /* + * BTREE_ITER_ATOMIC means we have to return -EINTR if we + * dropped locks: + */ + if (!(trans->flags & BTREE_INSERT_ATOMIC)) + goto retry; } - if (!ret) - goto retry; - goto out; } @@ -1859,22 +1828,26 @@ int bch_btree_update(struct cache_set *c, enum btree_id id, { struct btree_iter iter; struct bkey_s_c u; - int ret, ret2; + int ret; EBUG_ON(id == BTREE_ID_EXTENTS); bch_btree_iter_init_intent(&iter, c, id, k->k.p); u = bch_btree_iter_peek_with_holes(&iter); + ret = btree_iter_err(u); + if (ret) + return ret; - if (!u.k || bkey_deleted(u.k)) + if (bkey_deleted(u.k)) { + bch_btree_iter_unlock(&iter); return -ENOENT; + } ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0, BTREE_INSERT_ENTRY(&iter, k)); - ret2 = bch_btree_iter_unlock(&iter); - - return ret ?: ret2; + bch_btree_iter_unlock(&iter); + return ret; } /* @@ -1896,7 +1869,8 @@ int bch_btree_delete_range(struct cache_set *c, enum btree_id id, bch_btree_iter_init_intent(&iter, c, id, start); - while ((k = bch_btree_iter_peek(&iter)).k) { + while ((k = bch_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); /* really shouldn't be using a bare, unpadded bkey_i */ struct bkey_i delete; @@ -1943,7 +1917,8 @@ int bch_btree_delete_range(struct cache_set *c, enum btree_id id, bch_btree_iter_cond_resched(&iter); } - return bch_btree_iter_unlock(&iter) ?: ret; + bch_btree_iter_unlock(&iter); + return ret; } /** @@ -1960,11 +1935,10 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b, struct btree_reserve *reserve; struct btree_interior_update *as; - iter->locks_want = U8_MAX; - if (!bch_btree_iter_upgrade(iter)) + if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) return -EINTR; - reserve = bch_btree_reserve_get(c, b, 1, false, cl); + reserve = bch_btree_reserve_get(c, b, 0, false, cl); if (IS_ERR(reserve)) { trace_bcache_btree_gc_rewrite_node_fail(b); return PTR_ERR(reserve); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index acbfcbdc3072..9dcdf99539e8 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -265,7 +265,8 @@ static ssize_t bch_read_btree(struct file *file, char __user *buf, bch_btree_iter_init(&iter, i->c, i->id, i->from); - while ((k = bch_btree_iter_peek(&iter)).k) { + while ((k = bch_btree_iter_peek(&iter)).k && + !(err = btree_iter_err(k))) { bch_bkey_val_to_text(i->c, bkey_type(0, i->id), i->buf, sizeof(i->buf), k); i->bytes = strlen(i->buf); @@ -408,7 +409,8 @@ static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf, bch_btree_iter_init(&iter, i->c, i->id, i->from); - while ((k = bch_btree_iter_peek(&iter)).k) { + while ((k = bch_btree_iter_peek(&iter)).k && + !(err = btree_iter_err(k))) { struct btree_keys *b = &iter.nodes[0]->keys; struct btree_node_iter *node_iter = &iter.node_iters[0]; struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b); diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c index 47b4a365a7d8..68fe648ca64a 100644 --- a/drivers/md/bcache/dirent.c +++ b/drivers/md/bcache/dirent.c @@ -222,6 +222,13 @@ int bch_dirent_rename(struct cache_set *c, bool need_whiteout; int ret = -ENOMEM; + 
bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); + bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); + bch_btree_iter_link(&src_iter, &dst_iter); + + bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos); + bch_btree_iter_link(&src_iter, &whiteout_iter); + if (mode == BCH_RENAME_EXCHANGE) { new_src = dirent_create_key(0, src_name, 0); if (!new_src) @@ -233,146 +240,112 @@ int bch_dirent_rename(struct cache_set *c, new_dst = dirent_create_key(0, dst_name, 0); if (!new_dst) goto err; +retry: + /* + * Note that on -EINTR/dropped locks we're not restarting the lookup + * from the original hashed position (like we do when creating dirents, + * in bch_hash_set) - we never move existing dirents to different slot: + */ + old_src = bch_hash_lookup_at(dirent_hash_desc, + &src_ei->str_hash, + &src_iter, src_name); + if ((ret = btree_iter_err(old_src))) + goto err; - bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); - bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); - bch_btree_iter_link(&src_iter, &dst_iter); - - do { - /* - * Have to traverse lower btree nodes before higher - due to - * lock ordering. - */ - if (bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - old_src = bch_hash_lookup_at(dirent_hash_desc, - &src_ei->str_hash, - &src_iter, src_name); + ret = bch_hash_needs_whiteout(dirent_hash_desc, + &src_ei->str_hash, + &whiteout_iter, &src_iter); + if (ret < 0) + goto err; + need_whiteout = ret; + + /* + * Note that in BCH_RENAME mode, we're _not_ checking if + * the target already exists - we're relying on the VFS + * to do that check for us for correctness: + */ + old_dst = mode == BCH_RENAME + ? bch_hash_hole_at(dirent_hash_desc, &dst_iter) + : bch_hash_lookup_at(dirent_hash_desc, + &dst_ei->str_hash, + &dst_iter, dst_name); + if ((ret = btree_iter_err(old_dst))) + goto err; - need_whiteout = bch_hash_needs_whiteout(dirent_hash_desc, - &src_ei->str_hash, - &whiteout_iter, &src_iter); + switch (mode) { + case BCH_RENAME: + bkey_init(&new_src->k); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { /* - * Note that in BCH_RENAME mode, we're _not_ checking if - * the target already exists - we're relying on the VFS - * to do that check for us for correctness: + * If we couldn't insert new_dst at its hashed + * position (dst_pos) due to a hash collision, + * and we're going to be deleting in + * between the hashed position and first empty + * slot we found - just overwrite the pos we + * were going to delete: + * + * Note: this is a correctness issue, in this + * situation bch_hash_needs_whiteout() could + * return false when the whiteout would have + * been needed if we inserted at the pos + * __dirent_find_hole() found */ - old_dst = mode == BCH_RENAME - ? bch_hash_hole_at(dirent_hash_desc, &dst_iter) - : bch_hash_lookup_at(dirent_hash_desc, - &dst_ei->str_hash, - &dst_iter, dst_name); - } else { - old_dst = mode == BCH_RENAME - ? 
bch_hash_hole_at(dirent_hash_desc, &dst_iter) - : bch_hash_lookup_at(dirent_hash_desc, - &dst_ei->str_hash, - &dst_iter, dst_name); - - old_src = bch_hash_lookup_at(dirent_hash_desc, - &src_ei->str_hash, - &src_iter, src_name); - - need_whiteout = bch_hash_needs_whiteout(dirent_hash_desc, - &src_ei->str_hash, - &whiteout_iter, &src_iter); - } - - if (IS_ERR(old_src.k)) { - ret = PTR_ERR(old_src.k); - goto err_unlock; + new_dst->k.p = src_iter.pos; + ret = bch_btree_insert_at(c, NULL, NULL, + journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&src_iter, + &new_dst->k_i)); + goto err; } - if (IS_ERR(old_dst.k)) { - ret = PTR_ERR(old_dst.k); - goto err_unlock; - } + if (need_whiteout) + new_src->k.type = BCH_DIRENT_WHITEOUT; + break; + case BCH_RENAME_OVERWRITE: + bkey_init(&new_src->k); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - switch (mode) { - case BCH_RENAME: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * If we couldn't insert new_dst at its hashed - * position (dst_pos) due to a hash collision, - * and we're going to be deleting in - * between the hashed position and first empty - * slot we found - just overwrite the pos we - * were going to delete: - * - * Note: this is a correctness issue, in this - * situation bch_hash_needs_whiteout() could - * return false when the whiteout would have - * been needed if we inserted at the pos - * __dirent_find_hole() found - */ - new_dst->k.p = src_iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, - journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, - &new_dst->k_i)); - goto insert_done; - } - - if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_OVERWRITE: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * Same case described above - - * bch_hash_needs_whiteout could spuriously - * return false, but we have to insert at - * dst_iter.pos because we're overwriting - * another dirent: - */ - new_src->k.type = BCH_DIRENT_WHITEOUT; - } else if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_EXCHANGE: - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - break; - } + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + /* + * Same case described above - + * bch_hash_needs_whiteout could spuriously + * return false, but we have to insert at + * dst_iter.pos because we're overwriting + * another dirent: + */ + new_src->k.type = BCH_DIRENT_WHITEOUT; + } else if (need_whiteout) + new_src->k.type = BCH_DIRENT_WHITEOUT; + break; + case BCH_RENAME_EXCHANGE: + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + break; + } - new_src->k.p = src_iter.pos; - new_dst->k.p = dst_iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), - BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); -insert_done: - bch_btree_iter_unlink(&whiteout_iter); - bch_btree_iter_unlock(&src_iter); - bch_btree_iter_unlock(&dst_iter); - - if (bkey_cmp(src_iter.pos, src_pos) || - bkey_cmp(dst_iter.pos, dst_pos)) { - /* ugh */ - 
bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); - bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); - bch_btree_iter_link(&src_iter, &dst_iter); - } - } while (ret == -EINTR); + new_src->k.p = src_iter.pos; + new_dst->k.p = dst_iter.pos; + ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), + BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); err: + if (ret == -EINTR) + goto retry; + + bch_btree_iter_unlock(&whiteout_iter); + bch_btree_iter_unlock(&dst_iter); + bch_btree_iter_unlock(&src_iter); + if (new_src != (void *) &delete) kfree(new_src); kfree(new_dst); return ret; -err_unlock: - bch_btree_iter_unlink(&whiteout_iter); - ret = bch_btree_iter_unlock(&src_iter) ?: ret; - ret = bch_btree_iter_unlock(&dst_iter) ?: ret; - goto err; } int bch_dirent_delete(struct inode *dir, const struct qstr *name) diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c index d4a7c75a85a0..11c3657d22f8 100644 --- a/drivers/md/bcache/fs-gc.c +++ b/drivers/md/bcache/fs-gc.c @@ -281,8 +281,8 @@ static int bch_gc_walk_inodes(struct cache_set *c, struct nlinks *links, bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0)); genradix_iter_init(&nlinks_iter); - while (1) { - k = bch_btree_iter_peek(&iter); + while ((k = bch_btree_iter_peek(&iter)).k && + !btree_iter_err(k)) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (!link && (!k.k || iter.pos.inode >= range_end)) diff --git a/drivers/md/bcache/fs-io.c b/drivers/md/bcache/fs-io.c index 74d7578c9974..92d7a9bf8dcd 100644 --- a/drivers/md/bcache/fs-io.c +++ b/drivers/md/bcache/fs-io.c @@ -269,45 +269,41 @@ static int bchfs_write_index_update(struct bch_write_op *wop) struct keylist *keys = &op->op.insert_keys; struct btree_iter extent_iter, inode_iter; struct bchfs_extent_trans_hook hook; + struct bkey_i *k = bch_keylist_front(keys); int ret; - BUG_ON(bch_keylist_front(keys)->k.p.inode != op->ei->vfs_inode.i_ino); + BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino); bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS, bkey_start_pos(&bch_keylist_front(keys)->k)); bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES, POS(extent_iter.pos.inode, 0)); - bch_btree_iter_link(&extent_iter, &inode_iter); hook.op = op; hook.hook.fn = bchfs_extent_update_hook; hook.need_inode_update = false; do { - struct bkey_i *k = bch_keylist_front(keys); - - /* lock ordering... 
*/ - bch_btree_iter_unlock(&inode_iter); - ret = bch_btree_iter_traverse(&extent_iter); if (ret) - break; + goto err; /* XXX: ei->i_size locking */ + k = bch_keylist_front(keys); if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size) hook.need_inode_update = true; if (hook.need_inode_update) { struct bkey_s_c inode; - ret = bch_btree_iter_traverse(&inode_iter); - if (ret) - break; + if (!btree_iter_linked(&inode_iter)) + bch_btree_iter_link(&extent_iter, &inode_iter); inode = bch_btree_iter_peek_with_holes(&inode_iter); + if ((ret = btree_iter_err(inode))) + goto err; - if (WARN_ONCE(!inode.k || - inode.k->type != BCH_INODE_FS, + if (WARN_ONCE(inode.k->type != BCH_INODE_FS, "inode %llu not found when updating", extent_iter.pos.inode)) { ret = -ENOENT; @@ -327,7 +323,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, BTREE_INSERT_ENTRY(&extent_iter, k)); } - +err: if (ret == -EINTR) continue; if (ret) @@ -2058,16 +2054,13 @@ static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len) bch_btree_iter_set_pos(&src, POS(dst.pos.inode, dst.pos.offset + (len >> 9))); - /* Have to take intent locks before read locks: */ ret = bch_btree_iter_traverse(&dst); if (ret) - goto err_unwind; + goto btree_iter_err; k = bch_btree_iter_peek_with_holes(&src); - if (!k.k) { - ret = -EIO; - goto err_unwind; - } + if ((ret = btree_iter_err(k))) + goto btree_iter_err; bkey_reassemble(©.k, k); @@ -2089,11 +2082,11 @@ static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len) BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&dst, ©.k)); bch_disk_reservation_put(c, &disk_res); - +btree_iter_err: if (ret < 0 && ret != -EINTR) goto err_unwind; - bch_btree_iter_unlock(&src); + bch_btree_iter_cond_resched(&src); } bch_btree_iter_unlock(&src); @@ -2195,10 +2188,8 @@ static long bch_fallocate(struct inode *inode, int mode, struct disk_reservation disk_res = { 0 }; k = bch_btree_iter_peek_with_holes(&iter); - if (!k.k) { - ret = bch_btree_iter_unlock(&iter) ?: -EIO; - goto err_put_sectors_dirty; - } + if ((ret = btree_iter_err(k))) + goto btree_iter_err; /* already reserved */ if (k.k->type == BCH_RESERVATION) { @@ -2237,7 +2228,7 @@ static long bch_fallocate(struct inode *inode, int mode, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &reservation)); bch_disk_reservation_put(c, &disk_res); - +btree_iter_err: if (ret < 0 && ret != -EINTR) goto err_put_sectors_dirty; diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c index 00d85beee777..024459719e39 100644 --- a/drivers/md/bcache/fs.c +++ b/drivers/md/bcache/fs.c @@ -74,7 +74,10 @@ int __must_check __bch_write_inode(struct cache_set *c, do { struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); - if (WARN_ONCE(!k.k || k.k->type != BCH_INODE_FS, + if ((ret = btree_iter_err(k))) + goto out; + + if (WARN_ONCE(k.k->type != BCH_INODE_FS, "inode %llu not found when updating", inum)) { bch_btree_iter_unlock(&iter); return -ENOENT; @@ -163,7 +166,8 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0)); k = bch_btree_iter_peek_with_holes(&iter); - if (!k.k || k.k->type != BCH_INODE_FS) { + + if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) { ret = bch_btree_iter_unlock(&iter); iget_failed(inode); return ERR_PTR(ret ?: -ENOENT); diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c index 7cbe9b5dc757..9b2a4806805e 100644 --- a/drivers/md/bcache/inode.c +++ 
b/drivers/md/bcache/inode.c @@ -112,7 +112,6 @@ int bch_inode_create(struct cache_set *c, struct bkey_i *inode, u64 min, u64 max, u64 *hint) { struct btree_iter iter; - struct bkey_s_c k; bool searched_from_start = false; int ret; @@ -130,9 +129,14 @@ int bch_inode_create(struct cache_set *c, struct bkey_i *inode, again: bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0)); - while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { - if (k.k->p.inode >= max) - break; + while (1) { + struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); + + ret = btree_iter_err(k); + if (ret) { + bch_btree_iter_unlock(&iter); + return ret; + } if (k.k->type < BCH_INODE_FS) { inode->k.p = k.k->p; @@ -153,6 +157,8 @@ again: return ret; } else { + if (iter.pos.inode == max) + break; /* slot used */ bch_btree_iter_advance_pos(&iter); } diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index 363d2b02282a..5a26e2286504 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -95,7 +95,8 @@ int bch_move_data_off_device(struct cache *ca) bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); while (!bch_move_ctxt_wait(&ctxt) && - (k = bch_btree_iter_peek(&iter)).k) { + (k = bch_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch_extent_has_device(bkey_s_c_to_extent(k), ca->sb.nr_this_dev)) @@ -112,11 +113,8 @@ int bch_move_data_off_device(struct cache *ca) bch_move_ctxt_wait_for_io(&ctxt); continue; } - if (ret == -ENOSPC) { - bch_btree_iter_unlock(&iter); - bch_move_ctxt_exit(&ctxt); - return -ENOSPC; - } + if (ret == -ENOSPC) + break; BUG_ON(ret); seen_key_count++; @@ -125,7 +123,7 @@ next: bch_btree_iter_cond_resched(&iter); } - ret = bch_btree_iter_unlock(&iter); + bch_btree_iter_unlock(&iter); bch_move_ctxt_exit(&ctxt); if (ret) @@ -180,7 +178,7 @@ retry: return ret; } - iter.locks_want = 0; + bch_btree_iter_set_locks_want(&iter, 0); } ret = bch_btree_iter_unlock(&iter); if (ret) @@ -312,14 +310,15 @@ static int bch_flag_key_bad(struct btree_iter *iter, int bch_flag_data_bad(struct cache *ca) { - int ret = 0, ret2; + int ret = 0; struct bkey_s_c k; struct bkey_s_c_extent e; struct btree_iter iter; bch_btree_iter_init(&iter, ca->set, BTREE_ID_EXTENTS, POS_MIN); - while ((k = bch_btree_iter_peek(&iter)).k) { + while ((k = bch_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k)) goto advance; @@ -364,7 +363,7 @@ advance: bch_btree_iter_advance_pos(&iter); } - ret2 = bch_btree_iter_unlock(&iter); + bch_btree_iter_unlock(&iter); - return ret ?: ret2; + return ret; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d93adbd2e609..27f5d572107c 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -78,7 +78,8 @@ static void read_moving(struct cache *ca, size_t buckets_to_move) bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); while (!bch_move_ctxt_wait(&ctxt) && - (k = bch_btree_iter_peek(&iter)).k) { + (k = bch_btree_iter_peek(&iter)).k && + !btree_iter_err(k)) { if (!moving_pred(ca, k)) goto next; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 71883352abd8..c8ed80f58bf8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -508,6 +508,8 @@ retry: /* not present (hole), or stale cached data */ if (cached_dev_cache_miss(&iter, s, bio, sectors)) { k = bch_btree_iter_peek_with_holes(&iter); + if (btree_iter_err(k)) + break; goto retry; } } diff --git 
a/drivers/md/bcache/str_hash.h b/drivers/md/bcache/str_hash.h index f6a441b7be00..9a718a8e9eaa 100644 --- a/drivers/md/bcache/str_hash.h +++ b/drivers/md/bcache/str_hash.h @@ -95,12 +95,13 @@ bch_hash_lookup_at(const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, const void *search) { - struct bkey_s_c k; u64 inode = iter->pos.inode; - while ((k = bch_btree_iter_peek_with_holes(iter)).k) { - if (k.k->p.inode != inode) - break; + do { + struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); + + if (btree_iter_err(k)) + return k; if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, search)) @@ -113,7 +114,7 @@ bch_hash_lookup_at(const struct bch_hash_desc desc, } bch_btree_iter_advance_pos(iter); - } + } while (iter->pos.inode == inode); return bkey_s_c_err(-ENOENT); } @@ -123,12 +124,13 @@ bch_hash_lookup_bkey_at(const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, struct bkey_s_c search) { - struct bkey_s_c k; u64 inode = iter->pos.inode; - while ((k = bch_btree_iter_peek_with_holes(iter)).k) { - if (k.k->p.inode != inode) - break; + do { + struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); + + if (btree_iter_err(k)) + return k; if (k.k->type == desc.key_type) { if (!desc.cmp_bkey(k, search)) @@ -141,7 +143,7 @@ bch_hash_lookup_bkey_at(const struct bch_hash_desc desc, } bch_btree_iter_advance_pos(iter); - } + } while (iter->pos.inode == inode); return bkey_s_c_err(-ENOENT); } @@ -173,21 +175,20 @@ bch_hash_lookup_intent(const struct bch_hash_desc desc, static inline struct bkey_s_c bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) { - struct bkey_s_c k; - u64 inode = iter->pos.inode; + while (1) { + struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); - while ((k = bch_btree_iter_peek_with_holes(iter)).k) { - if (k.k->p.inode != inode) - break; + if (btree_iter_err(k)) + return k; if (k.k->type != desc.key_type) return k; /* hash collision, keep going */ bch_btree_iter_advance_pos(iter); + if (iter->pos.inode != k.k->p.inode) + return bkey_s_c_err(-ENOENT); } - - return bkey_s_c_err(-ENOSPC); } static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc, @@ -202,21 +203,20 @@ static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc, return bch_hash_hole_at(desc, iter); } -static inline bool bch_hash_needs_whiteout(const struct bch_hash_desc desc, +static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, struct btree_iter *start) { - struct bkey_s_c k; - - bch_btree_iter_init_copy(iter, start); + bch_btree_iter_set_pos(iter, + btree_type_successor(start->btree_id, start->pos)); while (1) { - bch_btree_iter_advance_pos(iter); - k = bch_btree_iter_peek_with_holes(iter); + struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); + int ret = btree_iter_err(k); - if (!k.k) - return iter->error ? 
true : false; + if (ret) + return ret; if (k.k->type != desc.key_type && k.k->type != desc.whiteout_type) @@ -225,6 +225,8 @@ static inline bool bch_hash_needs_whiteout(const struct bch_hash_desc desc, if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) return true; + + bch_btree_iter_advance_pos(iter); } } @@ -245,54 +247,57 @@ static inline int bch_hash_set(const struct bch_hash_desc desc, POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)))); bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos); bch_btree_iter_link(&hashed_slot, &iter); +retry: + /* + * On hash collision, we have to keep the slot we hashed to locked while + * we do the insert - to avoid racing with another thread deleting + * whatever's in the slot we hashed to: + */ + ret = bch_btree_iter_traverse(&hashed_slot); + if (ret) + goto err; - do { - ret = bch_btree_iter_traverse(&hashed_slot); - if (ret) - break; + /* + * On -EINTR/retry, we dropped locks - always restart from the slot we + * hashed to: + */ + bch_btree_iter_copy(&iter, &hashed_slot); + + k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert)); + + ret = btree_iter_err(k); + if (ret == -ENOENT) { + if (flags & BCH_HASH_SET_MUST_REPLACE) { + ret = -ENOENT; + goto err; + } /* - * If there's a hash collision and we have to insert at a - * different slot, on -EINTR (insert had to drop locks) we have - * to recheck the slot we hashed to - it could have been deleted - * while we dropped locks: + * Not found, so we're now looking for any open + * slot - we might have skipped over a whiteout + * that we could have used, so restart from the + * slot we hashed to: */ bch_btree_iter_copy(&iter, &hashed_slot); - k = bch_hash_lookup_bkey_at(desc, info, &iter, - bkey_i_to_s_c(insert)); - if (IS_ERR(k.k)) { - if (flags & BCH_HASH_SET_MUST_REPLACE) { - ret = -ENOENT; - goto err; - } - - /* - * Not found, so we're now looking for any open - * slot - we might have skipped over a whiteout - * that we could have used, so restart from the - * slot we hashed to: - */ - bch_btree_iter_copy(&iter, &hashed_slot); - k = bch_hash_hole_at(desc, &iter); - if (IS_ERR(k.k)) { - ret = PTR_ERR(k.k); - goto err; - } - } else { - if (flags & BCH_HASH_SET_MUST_CREATE) { - ret = -EEXIST; - goto err; - } + k = bch_hash_hole_at(desc, &iter); + if ((ret = btree_iter_err(k))) + goto err; + } else if (!ret) { + if (flags & BCH_HASH_SET_MUST_CREATE) { + ret = -EEXIST; + goto err; } + } else { + goto err; + } - insert->k.p = iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, insert)); - - /* unlock before traversing hashed_slot: */ - bch_btree_iter_unlock(&iter); - } while (ret == -EINTR); + insert->k.p = iter.pos; + ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&iter, insert)); +err: + if (ret == -EINTR) + goto retry; /* * On successful insert, we don't want to clobber ret with error from @@ -301,10 +306,6 @@ static inline int bch_hash_set(const struct bch_hash_desc desc, bch_btree_iter_unlock(&iter); bch_btree_iter_unlock(&hashed_slot); return ret; -err: - ret = bch_btree_iter_unlock(&iter) ?: ret; - ret = bch_btree_iter_unlock(&hashed_slot) ?: ret; - return ret; } static inline int bch_hash_delete(const struct bch_hash_desc desc, @@ -319,34 +320,31 @@ static inline int bch_hash_delete(const struct bch_hash_desc desc, bch_btree_iter_init_intent(&iter, c, desc.btree_id, POS(inode, desc.hash_key(info, key))); + 
bch_btree_iter_init(&whiteout_iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + bch_btree_iter_link(&iter, &whiteout_iter); +retry: + k = bch_hash_lookup_at(desc, info, &iter, key); + if ((ret = btree_iter_err(k))) + goto err; + + ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter); + if (ret < 0) + goto err; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED; + + ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&iter, &delete)); +err: + if (ret == -EINTR) + goto retry; - do { - k = bch_hash_lookup_at(desc, info, &iter, key); - if (IS_ERR(k.k)) - return bch_btree_iter_unlock(&iter) ?: -ENOENT; - - bkey_init(&delete.k); - delete.k.p = k.k->p; - delete.k.type = bch_hash_needs_whiteout(desc, info, - &whiteout_iter, &iter) - ? desc.whiteout_type - : KEY_TYPE_DELETED; - - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, &delete)); - /* - * Need to hold whiteout iter locked while we do the delete, if - * we're not leaving a whiteout: - */ - bch_btree_iter_unlink(&whiteout_iter); - /* - * XXX: if we ever cleanup whiteouts, we may need to rewind - * iterator on -EINTR - */ - } while (ret == -EINTR); - + bch_btree_iter_unlock(&whiteout_iter); bch_btree_iter_unlock(&iter); return ret; } diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index baecaf4c3aa7..bbace2f1a6b1 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -127,7 +127,8 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); while (!bch_move_ctxt_wait(&ctxt) && - (k = bch_btree_iter_peek(&iter)).k) { + (k = bch_btree_iter_peek(&iter)).k && + !btree_iter_err(k)) { if (!tiering_pred(c, &s, k)) goto next; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e3c62520159e..6397b0f0ae69 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -542,35 +542,33 @@ TRACE_EVENT(bcache_mca_scan, ); DECLARE_EVENT_CLASS(mca_cannibalize_lock, - TP_PROTO(struct cache_set *c, struct closure *cl), - TP_ARGS(c, cl), + TP_PROTO(struct cache_set *c), + TP_ARGS(c), TP_STRUCT__entry( __array(char, uuid, 16 ) - __field(struct closure *, cl ) ), TP_fast_assign( memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); - __entry->cl = cl; ), - TP_printk("%pU cl %p", __entry->uuid, __entry->cl) + TP_printk("%pU", __entry->uuid) ); DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock_fail, - TP_PROTO(struct cache_set *c, struct closure *cl), - TP_ARGS(c, cl) + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock, - TP_PROTO(struct cache_set *c, struct closure *cl), - TP_ARGS(c, cl) + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize, - TP_PROTO(struct cache_set *c, struct closure *cl), - TP_ARGS(c, cl) + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock, @@ -578,46 +576,6 @@ DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock, TP_ARGS(c) ); -DECLARE_EVENT_CLASS(btree_node_op, - TP_PROTO(struct btree *b, void *op), - TP_ARGS(b, op), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, bucket ) - __field(u8, level ) - __field(u8, id ) - __field(void *, op ) - ), - - TP_fast_assign( - 
memcpy(__entry->uuid, b->c->disk_sb.user_uuid.b, 16); - __entry->bucket = PTR_BUCKET_NR_TRACE(b->c, &b->key, 0); - __entry->level = b->level; - __entry->id = b->btree_id; - __entry->op = op; - ), - - TP_printk("%pU bucket %llu(%u) id %u op %p", - __entry->uuid, __entry->bucket, __entry->level, __entry->id, - __entry->op) -); - -DEFINE_EVENT(btree_node_op, bcache_btree_upgrade_lock, - TP_PROTO(struct btree *b, void *op), - TP_ARGS(b, op) -); - -DEFINE_EVENT(btree_node_op, bcache_btree_upgrade_lock_fail, - TP_PROTO(struct btree *b, void *op), - TP_ARGS(b, op) -); - -DEFINE_EVENT(btree_node_op, bcache_btree_intent_lock_fail, - TP_PROTO(struct btree *b, void *op), - TP_ARGS(b, op) -); - TRACE_EVENT(bcache_btree_insert_key, TP_PROTO(struct btree *b, struct bkey_i *k), TP_ARGS(b, k), diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 1c5111d4d5a6..01323c95024d 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -1092,7 +1092,7 @@ enum btree_id { #undef DEF_BTREE_ID -#define BTREE_MAX_DEPTH 4 +#define BTREE_MAX_DEPTH 4U /* Btree nodes */ |
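
The caller-side pattern the patch converts every site to is the same throughout the files above: peek through the iterator, check btree_iter_err() on the returned key, and restart the whole operation from a retry label when the iterator reports -EINTR because it had to drop and reacquire locks, with a single bch_btree_iter_unlock() on the way out. A minimal sketch of that retry idiom, for illustration only: delete_key_example() is a hypothetical caller, not part of this patch, but the functions it uses (bch_btree_iter_init_intent, bch_btree_iter_peek_with_holes, btree_iter_err, bch_btree_insert_at, bch_btree_iter_unlock) are the ones used in the diff itself.

/*
 * Illustrative sketch only -- not part of the patch. Mirrors the shape of
 * the new bch_hash_delete()/bch_hash_set() above: peek, check for an error
 * key, do the atomic insert, and jump back to retry on -EINTR.
 */
static int delete_key_example(struct cache_set *c, enum btree_id id,
			      struct bpos pos, u64 *journal_seq)
{
	struct btree_iter iter;
	struct bkey_i delete;
	struct bkey_s_c k;
	int ret;

	bch_btree_iter_init_intent(&iter, c, id, pos);
retry:
	k = bch_btree_iter_peek_with_holes(&iter);
	ret = btree_iter_err(k);	/* error key: -EIO, -EINTR, ... */
	if (ret)
		goto err;

	bkey_init(&delete.k);
	delete.k.p = k.k->p;
	delete.k.type = KEY_TYPE_DELETED;

	ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
				  BTREE_INSERT_ATOMIC,
				  BTREE_INSERT_ENTRY(&iter, &delete));
err:
	if (ret == -EINTR)
		goto retry;	/* iterator dropped its locks; start over */

	bch_btree_iter_unlock(&iter);
	return ret;
}

Compared to the old convention, -EINTR from a linked or atomic-insert path now only means "locks were dropped, restart from the retry label"; the unlock is done once at the end rather than on every error path.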