-rw-r--r--  drivers/md/bcache/btree_cache.c    |  85
-rw-r--r--  drivers/md/bcache/btree_cache.h    |   7
-rw-r--r--  drivers/md/bcache/btree_io.c       |  12
-rw-r--r--  drivers/md/bcache/btree_iter.c     | 476
-rw-r--r--  drivers/md/bcache/btree_iter.h     |  53
-rw-r--r--  drivers/md/bcache/btree_locking.h  |  59
-rw-r--r--  drivers/md/bcache/btree_types.h    |   6
-rw-r--r--  drivers/md/bcache/btree_update.c   | 108
-rw-r--r--  drivers/md/bcache/debug.c          |   6
-rw-r--r--  drivers/md/bcache/dirent.c         | 225
-rw-r--r--  drivers/md/bcache/fs-gc.c          |   4
-rw-r--r--  drivers/md/bcache/fs-io.c          |  45
-rw-r--r--  drivers/md/bcache/fs.c             |   8
-rw-r--r--  drivers/md/bcache/inode.c          |  14
-rw-r--r--  drivers/md/bcache/migrate.c        |  23
-rw-r--r--  drivers/md/bcache/movinggc.c       |   3
-rw-r--r--  drivers/md/bcache/request.c        |   2
-rw-r--r--  drivers/md/bcache/str_hash.h       | 192
-rw-r--r--  drivers/md/bcache/tier.c           |   3
-rw-r--r--  include/trace/events/bcache.h      |  60
-rw-r--r--  include/uapi/linux/bcache.h        |   2
21 files changed, 740 insertions, 653 deletions
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index 22dae330acdd..a997327eb37d 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -448,8 +448,8 @@ int mca_cannibalize_lock(struct cache_set *c, struct closure *cl)
goto success;
if (!cl) {
- trace_bcache_mca_cannibalize_lock_fail(c, cl);
- return -EINTR;
+ trace_bcache_mca_cannibalize_lock_fail(c);
+ return -ENOMEM;
}
closure_wait(&c->mca_wait, cl);
@@ -462,11 +462,11 @@ int mca_cannibalize_lock(struct cache_set *c, struct closure *cl)
goto success;
}
- trace_bcache_mca_cannibalize_lock_fail(c, cl);
+ trace_bcache_mca_cannibalize_lock_fail(c);
return -EAGAIN;
success:
- trace_bcache_mca_cannibalize_lock(c, cl);
+ trace_bcache_mca_cannibalize_lock(c);
return 0;
}
@@ -492,10 +492,9 @@ static struct btree *mca_cannibalize(struct cache_set *c)
}
}
-struct btree *mca_alloc(struct cache_set *c, struct closure *cl)
+struct btree *mca_alloc(struct cache_set *c)
{
struct btree *b;
- int ret;
u64 start_time = local_clock();
mutex_lock(&c->btree_cache_lock);
@@ -546,37 +545,34 @@ out:
return b;
err:
/* Try to cannibalize another cached btree node: */
- ret = mca_cannibalize_lock(c, cl);
- if (ret) {
+ if (c->btree_cache_alloc_lock == current) {
+ b = mca_cannibalize(c);
+ list_del_init(&b->list);
mutex_unlock(&c->btree_cache_lock);
- return ERR_PTR(ret);
- }
- b = mca_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
+ mca_hash_remove(c, b);
- mca_hash_remove(c, b);
+ trace_bcache_mca_cannibalize(c);
+ goto out;
+ }
- trace_bcache_mca_cannibalize(c, cl);
- goto out;
+ mutex_unlock(&c->btree_cache_lock);
+ return ERR_PTR(-ENOMEM);
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
const struct bkey_i *k,
unsigned level,
- struct closure *cl)
+ enum six_lock_type lock_type)
{
struct cache_set *c = iter->c;
struct btree *b;
- b = mca_alloc(c, cl);
+ b = mca_alloc(c);
if (IS_ERR(b))
return b;
- mca_cannibalize_unlock(c);
-
bkey_copy(&b->key, k);
if (mca_hash_insert(c, b, level, iter->btree_id)) {
/* raced with another fill: */
@@ -608,9 +604,7 @@ static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
bch_btree_node_read(c, b);
six_unlock_write(&b->lock);
- mark_btree_node_locked(iter, level, btree_lock_want(iter, level));
-
- if (btree_lock_want(iter, level) == SIX_LOCK_read)
+ if (lock_type == SIX_LOCK_read)
BUG_ON(!six_trylock_convert(&b->lock,
SIX_LOCK_intent,
SIX_LOCK_read));
@@ -629,7 +623,7 @@ static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
*/
struct btree *bch_btree_node_get(struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
- struct closure *cl)
+ enum six_lock_type lock_type)
{
int i = 0;
struct btree *b;
@@ -646,16 +640,14 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch_btree_node_fill(iter, k, level, cl);
+ b = bch_btree_node_fill(iter, k, level, lock_type);
/* We raced and found the btree node in the cache */
if (!b)
goto retry;
- if (IS_ERR(b)) {
- BUG_ON(PTR_ERR(b) != -EAGAIN);
+ if (IS_ERR(b))
return b;
- }
} else {
/*
* There's a potential deadlock with splits and insertions into
@@ -678,8 +670,8 @@ retry:
* free it:
*
* To guard against this, btree nodes are evicted from the cache
- * when they're freed - and PTR_HASH() is zeroed out, which will
- * cause our btree_node_lock() call below to fail.
+ * when they're freed - and PTR_HASH() is zeroed out, which we
+ * check for after we lock the node.
*
* Then, btree_node_relock() on the parent will fail - because
* the parent was modified, when the pointer to the node we want
@@ -688,34 +680,35 @@ retry:
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
- if (!btree_node_lock(b, iter, level,
- PTR_HASH(&b->key) != PTR_HASH(k))) {
- if (!btree_node_relock(iter, level + 1)) {
- trace_bcache_btree_intent_lock_fail(b, iter);
- return ERR_PTR(-EINTR);
- }
+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
+ return ERR_PTR(-EINTR);
- goto retry;
- }
+ if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+ b->level != level ||
+ race_fault())) {
+ six_unlock_type(&b->lock, lock_type);
+ if (btree_node_relock(iter, level + 1))
+ goto retry;
- BUG_ON(b->level != level);
+ return ERR_PTR(-EINTR);
+ }
}
- /* avoid atomic set bit if it's not needed: */
- if (btree_node_accessed(b))
- set_btree_node_accessed(b);
-
for (; i <= b->keys.nsets; i++) {
prefetch(b->keys.set[i].tree);
prefetch(b->keys.set[i].data);
}
- if (btree_node_read_error(b)) {
- __btree_node_unlock(iter, level, b);
+ /* avoid atomic set bit if it's not needed: */
+ if (btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->lock, lock_type);
return ERR_PTR(-EIO);
}
- BUG_ON(!b->written);
+ EBUG_ON(!b->written);
return b;
}
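
The reworked bch_btree_node_get() takes the wanted lock type up front and re-validates the node only after the lock is held: if PTR_HASH() or the level no longer match, it unlocks and either retries (if the parent can be relocked) or returns -EINTR. Below is a minimal userspace sketch of that lock-then-revalidate pattern, with a pthread mutex standing in for the six lock; struct node, node_get() and the hash field are hypothetical stand-ins, not the kernel API.

#include <pthread.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;
	unsigned long   hash;	/* zeroed when the node is freed/evicted */
	unsigned        level;
};

/* Returns the locked node, or NULL if the caller must redo the traversal. */
static struct node *node_get(struct node *b, unsigned long want_hash,
			     unsigned want_level)
{
	pthread_mutex_lock(&b->lock);

	/* The identity check is only meaningful once the lock is held: */
	if (b->hash != want_hash || b->level != want_level) {
		pthread_mutex_unlock(&b->lock);
		return NULL;	/* raced with free/replace - caller retries */
	}

	return b;
}

int main(void)
{
	struct node n = {
		.lock  = PTHREAD_MUTEX_INITIALIZER,
		.hash  = 0xabc,
		.level = 0,
	};

	if (node_get(&n, 0xabc, 0))	/* identity matches: node stays locked */
		pthread_mutex_unlock(&n.lock);

	n.hash = 0;			/* "freed": node_get() now returns NULL */
	(void)node_get(&n, 0xabc, 0);
	return 0;
}
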
diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h
index e3950bf4cfb3..5beec6e4e942 100644
--- a/drivers/md/bcache/btree_cache.h
+++ b/drivers/md/bcache/btree_cache.h
@@ -17,11 +17,10 @@ int mca_hash_insert(struct cache_set *, struct btree *,
void mca_cannibalize_unlock(struct cache_set *);
int mca_cannibalize_lock(struct cache_set *, struct closure *);
-struct btree *mca_alloc(struct cache_set *, struct closure *);
+struct btree *mca_alloc(struct cache_set *);
-struct btree *bch_btree_node_get(struct btree_iter *,
- const struct bkey_i *, unsigned,
- struct closure *);
+struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
+ unsigned, enum six_lock_type);
void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index c343811d9bc9..1033232307f4 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -455,13 +455,19 @@ int bch_btree_root_read(struct cache_set *c, enum btree_id id,
{
struct closure cl;
struct btree *b;
+ int ret;
closure_init_stack(&cl);
- while (IS_ERR(b = mca_alloc(c, &cl))) {
- BUG_ON(PTR_ERR(b) != -EAGAIN);
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
closure_sync(&cl);
- }
+ } while (ret);
+
+ b = mca_alloc(c);
+ mca_cannibalize_unlock(c);
+
+ BUG_ON(IS_ERR(b));
bkey_copy(&b->key, k);
BUG_ON(mca_hash_insert(c, b, level, id));
diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c
index f0e729b7f6eb..d6ab2794254b 100644
--- a/drivers/md/bcache/btree_iter.c
+++ b/drivers/md/bcache/btree_iter.c
@@ -124,29 +124,164 @@ success:
return true;
}
+/* Slowpath: */
+bool __bch_btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
+{
+ struct btree_iter *linked;
+
+ /* Can't have children locked before ancestors: */
+ EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
+
+ /*
+ * Can't hold any read locks while we block taking an intent lock - see
+ * below for reasoning, and we should have already dropped any read
+ * locks in the current iterator
+ */
+ EBUG_ON(type == SIX_LOCK_intent &&
+ iter->nodes_locked != iter->nodes_intent_locked);
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == type) {
+ six_lock_increment(&b->lock, type);
+ return true;
+ }
+
+ /*
+	 * Must lock btree nodes in key order - this case happens when locking
+ * the prev sibling in btree node merging:
+ */
+ if (iter->nodes_locked &&
+ __ffs(iter->nodes_locked) == level &&
+ __btree_iter_cmp(iter->btree_id, pos, iter))
+ return false;
+
+ for_each_linked_btree_iter(iter, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ /*
+ * Can't block taking an intent lock if we have _any_ nodes read
+ * locked:
+ *
+ * - Our read lock blocks another thread with an intent lock on
+ * the same node from getting a write lock, and thus from
+ * dropping its intent lock
+ *
+ * - And the other thread may have multiple nodes intent locked:
+ * both the node we want to intent lock, and the node we
+ * already have read locked - deadlock:
+ */
+ if (type == SIX_LOCK_intent &&
+ linked->nodes_locked != linked->nodes_intent_locked) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+
+ /* We have to lock btree nodes in key order: */
+ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+ return false;
+
+ /*
+ * Interior nodes must be locked before their descendants: if
+ * another iterator has possible descendants locked of the node
+ * we're about to lock, it must have the ancestors locked too:
+ */
+ if (linked->btree_id == iter->btree_id &&
+ level > __fls(linked->nodes_locked)) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+ }
+
+ six_lock_type(&b->lock, type);
+ return true;
+}
+
/* Btree iterator locking: */
-bool bch_btree_iter_upgrade(struct btree_iter *iter)
+
+static void btree_iter_drop_extra_locks(struct btree_iter *iter)
{
- int i;
+ unsigned l;
+
+ while (iter->nodes_locked &&
+ (l = __fls(iter->nodes_locked)) > iter->locks_want) {
+ if (!btree_node_locked(iter, l))
+ panic("l %u nodes_locked %u\n", l, iter->nodes_locked);
+
+ if (l > iter->level) {
+ btree_node_unlock(iter, l);
+ } else if (btree_node_intent_locked(iter, l)) {
+ BUG_ON(!six_trylock_convert(&iter->nodes[l]->lock,
+ SIX_LOCK_intent,
+ SIX_LOCK_read));
+ iter->nodes_intent_locked ^= 1 << l;
+ }
+ }
+}
- for (i = iter->level; i < iter->locks_want && iter->nodes[i]; i++)
- if (!btree_node_relock(iter, i))
- return false;
+bool __bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ struct btree_iter *linked;
+ unsigned l;
+
+ /* Drop locks we don't want anymore: */
+ if (new_locks_want < iter->locks_want)
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->locks_want > new_locks_want) {
+ linked->locks_want = max_t(unsigned, 1,
+ new_locks_want);
+ btree_iter_drop_extra_locks(linked);
+ }
+
+ iter->locks_want = new_locks_want;
+ btree_iter_drop_extra_locks(iter);
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!btree_node_relock(iter, l))
+ goto fail;
return true;
+fail:
+ /*
+ * Just an optimization: ancestor nodes must be locked before child
+ * nodes, so set locks_want on iterators that might lock ancestors
+ * before us to avoid getting -EINTR later:
+ */
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->btree_id == iter->btree_id &&
+ btree_iter_cmp(linked, iter) <= 0)
+ linked->locks_want = max_t(unsigned, linked->locks_want,
+ new_locks_want);
+ return false;
}
-int bch_btree_iter_unlock(struct btree_iter *iter)
+static int __bch_btree_iter_unlock(struct btree_iter *iter)
{
- unsigned l = 0;
+ BUG_ON(iter->error == -EINTR);
while (iter->nodes_locked)
- btree_node_unlock(iter, l++);
+ btree_node_unlock(iter, __ffs(iter->nodes_locked));
return iter->error;
}
+int bch_btree_iter_unlock(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ __bch_btree_iter_unlock(linked);
+ return __bch_btree_iter_unlock(iter);
+}
+
/* Btree iterator: */
#ifdef CONFIG_BCACHE_DEBUG
@@ -283,6 +418,8 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
&iter->nodes[iter->level]->keys);
struct bkey_s_c ret;
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
if (!k)
return bkey_s_c_null;
@@ -302,6 +439,8 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
&iter->nodes[iter->level]->keys);
struct bkey_s_c ret;
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
if (!k)
return bkey_s_c_null;
@@ -320,15 +459,14 @@ static inline void __btree_iter_advance(struct btree_iter *iter)
}
static inline void btree_iter_node_set(struct btree_iter *iter,
- struct btree *b,
- struct bpos pos)
+ struct btree *b)
{
BUG_ON(b->lock.state.seq & 1);
iter->lock_seq[b->level] = b->lock.state.seq;
iter->nodes[b->level] = b;
bch_btree_node_iter_init(&iter->node_iters[b->level], &b->keys,
- pos, iter->is_extents);
+ iter->pos, iter->is_extents);
}
static bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b)
@@ -371,7 +509,7 @@ bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
mark_btree_node_intent_locked(linked, b->level);
}
- btree_iter_node_set(linked, b, linked->pos);
+ btree_iter_node_set(linked, b);
}
if (!btree_iter_pos_in_node(iter, b)) {
@@ -380,7 +518,7 @@ bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
}
mark_btree_node_intent_locked(iter, b->level);
- btree_iter_node_set(iter, b, iter->pos);
+ btree_iter_node_set(iter, b);
return true;
}
@@ -425,86 +563,71 @@ void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
iter->pos, iter->is_extents);
}
-static void btree_iter_verify_locking(struct btree_iter *iter, unsigned level)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- struct btree_iter *linked;
-
- if (!btree_want_intent(iter, level))
- return;
-
- /*
- * Can't hold _any_ read locks (including in linked iterators) when
- * taking intent locks, that leads to a fun deadlock involving write
- * locks and journal reservations
- *
- * We could conceivably drop read locks, then retake them and if
- * retaking fails then return -EINTR... but, let's keep things simple
- * for now:
- */
- BUG_ON(iter->nodes_locked != iter->nodes_intent_locked);
-
- for_each_linked_btree_iter(iter, linked)
- BUG_ON(linked->nodes_locked != linked->nodes_intent_locked);
-
- /* Lock ordering: */
- for_each_linked_btree_iter(iter, linked) {
- BUG_ON(linked->nodes_locked &&
- btree_iter_cmp(linked, iter) > 0);
-
- BUG_ON(linked->nodes_locked &&
- linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked));
- }
-#endif
-}
-
-static inline void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos,
- unsigned l)
+static inline int btree_iter_lock_root(struct btree_iter *iter,
+ unsigned depth_want)
{
struct cache_set *c = iter->c;
+ struct btree *b;
+ enum six_lock_type lock_type;
+ unsigned i;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- memset(iter->nodes, 0, sizeof(iter->nodes));
+ EBUG_ON(iter->nodes_locked);
while (1) {
- struct btree *b = READ_ONCE(c->btree_roots[iter->btree_id].b);
-
+ b = READ_ONCE(c->btree_roots[iter->btree_id].b);
iter->level = READ_ONCE(b->level);
- if (iter->level < l) {
- iter->level = l;
- break;
+ if (iter->level < depth_want) {
+ /*
+ * the root is at a lower depth than the depth we want:
+ * got to the end of the btree, or we're walking nodes
+ * greater than some depth and there are no nodes >=
+ * that depth
+ */
+ iter->level = depth_want;
+ iter->nodes[iter->level] = NULL;
+ return 0;
}
- btree_iter_verify_locking(iter, iter->level);
+ lock_type = btree_lock_want(iter, iter->level);
+ if (!btree_node_lock(b, POS_MAX, iter->level,
+ iter, lock_type))
+ return -EINTR;
+
+ if (b == c->btree_roots[iter->btree_id].b &&
+ b->level == iter->level &&
+ !race_fault()) {
+ for (i = 0; i < iter->level; i++)
+ iter->nodes[i] = BTREE_ITER_NOT_END;
+ iter->nodes[iter->level] = b;
+
+ mark_btree_node_locked(iter, iter->level, lock_type);
+ btree_iter_node_set(iter, b);
+ return 0;
- if (btree_node_lock(b, iter, iter->level,
- (b != c->btree_roots[iter->btree_id].b))) {
- btree_iter_node_set(iter, b, pos);
- break;
}
+
+ six_unlock_type(&b->lock, lock_type);
}
}
-static inline int btree_iter_down(struct btree_iter *iter, struct bpos pos,
- struct closure *cl)
+static inline int btree_iter_down(struct btree_iter *iter)
{
struct btree *b;
struct bkey_s_c k = __btree_iter_peek(iter);
+ unsigned level = iter->level - 1;
+ enum six_lock_type lock_type = btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
bkey_reassemble(&tmp.k, k);
- b = bch_btree_node_get(iter, &tmp.k, iter->level - 1, cl);
+ b = bch_btree_node_get(iter, &tmp.k, level, lock_type);
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
- btree_iter_verify_locking(iter, iter->level - 1);
-
- --iter->level;
- btree_iter_node_set(iter, b, pos);
+ iter->level = level;
+ mark_btree_node_locked(iter, level, lock_type);
+ btree_iter_node_set(iter, b);
return 0;
}
@@ -513,6 +636,84 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
+int __must_check __bch_btree_iter_traverse(struct btree_iter *);
+
+static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
+{
+ struct cache_set *c = iter->c;
+ struct btree_iter *linked, *sorted_iters, **i;
+retry_all:
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
+ bch_btree_iter_unlock(iter);
+
+ if (ret != -ENOMEM && ret != -EINTR)
+ goto io_error;
+
+ if (ret == -ENOMEM) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+ }
+
+ /*
+	 * Linked iters are normally a circular singly linked list - break the cycle
+ * while we sort them:
+ */
+ linked = iter->next;
+ iter->next = NULL;
+ sorted_iters = NULL;
+
+ while (linked) {
+ iter = linked;
+ linked = linked->next;
+
+ i = &sorted_iters;
+ while (*i && btree_iter_cmp(iter, *i) > 0)
+ i = &(*i)->next;
+
+ iter->next = *i;
+ *i = iter;
+ }
+
+ /* Make list circular again: */
+ iter = sorted_iters;
+ while (iter->next)
+ iter = iter->next;
+ iter->next = sorted_iters;
+
+ /* Now, redo traversals in correct order: */
+
+ iter = sorted_iters;
+ do {
+retry:
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ goto retry;
+ goto retry_all;
+ }
+
+ iter = iter->next;
+ } while (iter != sorted_iters);
+
+ ret = btree_iter_linked(iter) ? -EINTR : 0;
+out:
+ mca_cannibalize_unlock(c);
+ return ret;
+io_error:
+ BUG_ON(ret != -EIO);
+
+ iter->error = ret;
+ iter->nodes[iter->level] = NULL;
+ goto out;
+}
+
/*
* This is the main state machine for walking down the btree - walks down to a
* specified depth
@@ -522,14 +723,10 @@ static void btree_iter_up(struct btree_iter *iter)
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch_btree_iter_unlock().
*/
-static int __must_check __bch_btree_iter_traverse(struct btree_iter *iter,
- unsigned l, struct bpos pos)
+int __must_check __bch_btree_iter_traverse(struct btree_iter *iter)
{
- if (!iter->nodes[iter->level])
- return 0;
+ unsigned depth_want = iter->level;
- iter->at_end_of_leaf = false;
-retry:
/* make sure we have all the intent locks we need - ugh */
if (unlikely(iter->nodes[iter->level] &&
iter->level + 1 < iter->locks_want)) {
@@ -566,7 +763,7 @@ retry:
struct bkey_s_c k;
while ((k = __btree_iter_peek_all(iter)).k &&
- !btree_iter_pos_cmp(pos, k.k, iter->is_extents))
+ !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents))
__btree_iter_advance(iter);
}
@@ -576,35 +773,33 @@ retry:
* here it indicates that relocking the root failed - it's critical that
* btree_iter_lock_root() comes next and that it can't fail
*/
- while (iter->level > l)
- if (iter->nodes[iter->level]) {
- struct closure cl;
- int ret;
-
- closure_init_stack(&cl);
-
- ret = btree_iter_down(iter, pos, &cl);
- if (unlikely(ret)) {
- bch_btree_iter_unlock(iter);
- closure_sync(&cl);
-
- if (ret == -EAGAIN || ret == -EINTR)
- goto retry;
-
- iter->error = ret;
- iter->level = BTREE_MAX_DEPTH;
- return ret;
- }
- } else {
- btree_iter_lock_root(iter, pos, l);
+ while (iter->level > depth_want) {
+ int ret = iter->nodes[iter->level]
+ ? btree_iter_down(iter)
+ : btree_iter_lock_root(iter, depth_want);
+ if (unlikely(ret)) {
+ iter->level = depth_want;
+ return ret;
}
+ }
return 0;
}
int __must_check bch_btree_iter_traverse(struct btree_iter *iter)
{
- return __bch_btree_iter_traverse(iter, iter->level, iter->pos);
+ int ret;
+
+ if (unlikely(!iter->nodes[iter->level]))
+ return 0;
+
+ iter->at_end_of_leaf = false;
+
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ ret = btree_iter_traverse_error(iter, ret);
+
+ return ret;
}
/* Iterate across nodes (leaf and interior nodes) */
@@ -648,11 +843,15 @@ struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
return NULL;
b = iter->nodes[iter->level];
+ if (!b)
+ return b;
if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
- struct bpos pos = bkey_successor(iter->pos);
+ /* Haven't gotten to the end of the parent node: */
+ iter->pos = bkey_successor(iter->pos);
+ iter->level = depth;
- ret = __bch_btree_iter_traverse(iter, depth, pos);
+ ret = bch_btree_iter_traverse(iter);
if (ret)
return NULL;
@@ -672,6 +871,7 @@ void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_p
struct btree_node_iter *node_iter = &iter->node_iters[0];
struct bkey_packed *k;
+ EBUG_ON(iter->level != 0);
EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
EBUG_ON(!btree_node_locked(iter, 0));
EBUG_ON(bkey_cmp(new_pos, iter->nodes[0]->key.k.p) > 0);
@@ -691,7 +891,7 @@ void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_p
void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
iter->pos = new_pos;
}
@@ -722,13 +922,14 @@ void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter)
{
struct bkey_s_c k;
- struct bpos pos = iter->pos;
int ret;
while (1) {
- ret = __bch_btree_iter_traverse(iter, 0, pos);
- if (unlikely(ret))
- break;
+ ret = bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ return bkey_s_c_err(ret);
+ }
k = __btree_iter_peek(iter);
if (likely(k.k)) {
@@ -742,16 +943,16 @@ struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter)
return k;
}
- pos = iter->nodes[0]->key.k.p;
+ iter->pos = iter->nodes[0]->key.k.p;
- if (!bkey_cmp(pos, POS_MAX))
- break;
+ if (!bkey_cmp(iter->pos, POS_MAX)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ bch_btree_iter_unlock(iter);
+ return bkey_s_c_null;
+ }
- pos = btree_type_successor(iter->btree_id, pos);
+ iter->pos = btree_type_successor(iter->btree_id, iter->pos);
}
-
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- return bkey_s_c_null;
}
struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter)
@@ -761,10 +962,10 @@ struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter)
int ret;
while (1) {
- ret = __bch_btree_iter_traverse(iter, 0, iter->pos);
+ ret = bch_btree_iter_traverse(iter);
if (unlikely(ret)) {
iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- return bkey_s_c_null;
+ return bkey_s_c_err(ret);
}
k = __btree_iter_peek_all(iter);
@@ -805,70 +1006,43 @@ recheck:
void __bch_btree_iter_init(struct btree_iter *iter, struct cache_set *c,
enum btree_id btree_id, struct bpos pos,
- int locks_want, unsigned depth)
+ unsigned locks_want, unsigned depth)
{
iter->level = depth;
iter->is_extents = btree_id == BTREE_ID_EXTENTS;
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
- iter->locks_want = locks_want;
+ iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
iter->btree_id = btree_id;
iter->at_end_of_leaf = 0;
iter->error = 0;
iter->c = c;
iter->pos = pos;
+ memset(iter->nodes, 0, sizeof(iter->nodes));
iter->nodes[iter->level] = BTREE_ITER_NOT_END;
- iter->nodes[iter->level + 1] = NULL;
iter->next = iter;
}
-int bch_btree_iter_unlink(struct btree_iter *iter)
+void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
{
- struct btree_iter *linked;
- int ret = bch_btree_iter_unlock(iter);
+ BUG_ON(btree_iter_linked(new));
- for_each_linked_btree_iter(iter, linked)
- if (linked->next == iter) {
- linked->next = iter->next;
- iter->next = iter;
- return ret;
- }
-
- BUG();
-}
-
-void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *linked)
-{
- BUG_ON(linked->next != linked);
-
- linked->next = iter->next;
- iter->next = linked;
+ new->next = iter->next;
+ iter->next = new;
if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
unsigned nr_iters = 1;
- for_each_linked_btree_iter(iter, linked)
+ for_each_linked_btree_iter(iter, new)
nr_iters++;
BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
}
}
-static void __bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
-{
- memcpy(dst, src, offsetof(struct btree_iter, next));
- dst->nodes_locked = dst->nodes_intent_locked = 0;
-}
-
void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
{
bch_btree_iter_unlock(dst);
- __bch_btree_iter_copy(dst, src);
-}
-
-void bch_btree_iter_init_copy(struct btree_iter *dst, struct btree_iter *src)
-{
- __bch_btree_iter_copy(dst, src);
- dst->next = dst;
- bch_btree_iter_link(src, dst);
+ memcpy(dst, src, offsetof(struct btree_iter, next));
+ dst->nodes_locked = dst->nodes_intent_locked = 0;
}
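
The new btree_iter_traverse_error() drops every lock, sorts the circular singly linked list of iterators by btree_iter_cmp(), and then redoes the traversals in that order so locks are always taken in a globally consistent order. The standalone sketch below shows the same break-the-cycle, insertion-sort, re-close-the-cycle sequence; struct iter and its integer pos field are hypothetical stand-ins for the real iterator and its (btree_id, pos) sort key.

#include <stdio.h>

/* Hypothetical stand-in for a linked btree iterator; pos is the sort key. */
struct iter {
	int          pos;
	struct iter *next;	/* circular singly linked list */
};

/*
 * Break the cycle at @head, insertion-sort the chain ascending by pos, then
 * make the list circular again and return the first iterator in sorted order.
 */
static struct iter *sort_linked_iters(struct iter *head)
{
	struct iter *todo = head->next, *sorted = NULL, **i, *it;

	head->next = NULL;	/* break the cycle; @head ends up last in @todo */

	while (todo) {
		it   = todo;
		todo = todo->next;

		i = &sorted;
		while (*i && it->pos > (*i)->pos)
			i = &(*i)->next;
		it->next = *i;
		*i = it;
	}

	for (it = sorted; it->next; it = it->next)
		;
	it->next = sorted;	/* make it circular again */

	return sorted;
}

int main(void)
{
	struct iter a = { .pos = 30 }, b = { .pos = 10 }, c = { .pos = 20 };
	struct iter *it, *first;

	a.next = &b; b.next = &c; c.next = &a;	/* cycle: a -> b -> c -> a */

	first = sort_linked_iters(&a);
	it = first;
	do {
		printf("%d\n", it->pos);	/* prints 10, 20, 30 */
		it = it->next;
	} while (it != first);

	return 0;
}
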
diff --git a/drivers/md/bcache/btree_iter.h b/drivers/md/bcache/btree_iter.h
index 694be63f9f1c..98353341d286 100644
--- a/drivers/md/bcache/btree_iter.h
+++ b/drivers/md/bcache/btree_iter.h
@@ -66,9 +66,16 @@ struct btree_iter {
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
+
+ /* Must come last: */
struct btree_iter *next;
};
+static inline bool btree_iter_linked(const struct btree_iter *iter)
+{
+ return iter->next != iter;
+}
+
/**
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter
*/
@@ -125,8 +132,20 @@ void bch_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bset_tree *,
struct bkey_packed *, unsigned, unsigned);
-bool bch_btree_iter_upgrade(struct btree_iter *);
int bch_btree_iter_unlock(struct btree_iter *);
+bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned);
+
+static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (iter->locks_want == new_locks_want &&
+ iter->nodes_intent_locked == (1 << new_locks_want) - 1)
+ return true;
+
+ return __bch_btree_iter_set_locks_want(iter, new_locks_want);
+}
bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
@@ -147,7 +166,7 @@ void bch_btree_iter_advance_pos(struct btree_iter *);
void bch_btree_iter_rewind(struct btree_iter *, struct bpos);
void __bch_btree_iter_init(struct btree_iter *, struct cache_set *,
- enum btree_id, struct bpos, int, unsigned);
+ enum btree_id, struct bpos, unsigned, unsigned);
static inline void bch_btree_iter_init(struct btree_iter *iter,
struct cache_set *c,
@@ -165,10 +184,8 @@ static inline void bch_btree_iter_init_intent(struct btree_iter *iter,
__bch_btree_iter_init(iter, c, btree_id, pos, 1, 0);
}
-int bch_btree_iter_unlink(struct btree_iter *);
void bch_btree_iter_link(struct btree_iter *, struct btree_iter *);
void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *);
-void bch_btree_iter_init_copy(struct btree_iter *, struct btree_iter *);
static inline struct bpos btree_type_successor(enum btree_id id,
struct bpos pos)
@@ -183,12 +200,19 @@ static inline struct bpos btree_type_successor(enum btree_id id,
return pos;
}
+static inline int __btree_iter_cmp(enum btree_id id,
+ struct bpos pos,
+ const struct btree_iter *r)
+{
+ if (id != r->btree_id)
+ return id < r->btree_id ? -1 : 1;
+ return bkey_cmp(pos, r->pos);
+}
+
static inline int btree_iter_cmp(const struct btree_iter *l,
const struct btree_iter *r)
{
- if (l->btree_id != r->btree_id)
- return l->btree_id < r->btree_id ? -1 : 1;
- return bkey_cmp(l->pos, r->pos);
+ return __btree_iter_cmp(l->btree_id, l->pos, r);
}
#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \
@@ -207,7 +231,7 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
_k, _locks_want) \
for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
_start, _locks_want, 0); \
- ((_k) = bch_btree_iter_peek(_iter)).k; \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \
bch_btree_iter_advance_pos(_iter))
#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \
@@ -220,7 +244,7 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
_start, _k, _locks_want) \
for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
_start, _locks_want, 0); \
- ((_k) = bch_btree_iter_peek_with_holes(_iter)).k; \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\
bch_btree_iter_advance_pos(_iter))
#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \
@@ -230,16 +254,27 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
_start, _k) \
__for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1)
+static inline int btree_iter_err(struct bkey_s_c k)
+{
+ return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
+}
+
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch_btree_iter_cond_resched(struct btree_iter *iter)
{
+ struct btree_iter *linked;
+
if (need_resched()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
bch_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
bch_btree_iter_unlock(iter);
}
}
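
With this change bch_btree_iter_peek() and bch_btree_iter_peek_with_holes() report failures through the returned key (bkey_s_c_err()), the for_each_btree_key*() loops stop on IS_ERR_OR_NULL(), and btree_iter_err() recovers the errno. The sketch below shows that encode-an-errno-in-a-pointer convention using simplified versions of the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() helpers; peek() and its static key array are purely illustrative.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Simplified takes on the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() helpers: */
static void *err_ptr(long err)      { return (void *)(intptr_t)err; }
static long  ptr_err(const void *p) { return (long)(intptr_t)p; }
static int   is_err(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* Hypothetical peek(): NULL means "no more keys", error pointer means failure. */
static const int *peek(int i)
{
	static const int keys[] = { 1, 2, 3 };

	if (i < 0)
		return err_ptr(-EINTR);	/* e.g. had to drop locks */
	if (i >= 3)
		return NULL;
	return &keys[i];
}

int main(void)
{
	const int *k;
	int i;

	/* mirrors for_each_btree_key(): stop on NULL *or* an error pointer */
	for (i = 0; (k = peek(i)) && !is_err(k); i++)
		printf("key %d\n", *k);

	k = peek(-1);			/* force the error path */
	if (is_err(k))
		printf("peek failed: %ld\n", ptr_err(k));
	return 0;
}
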
diff --git a/drivers/md/bcache/btree_locking.h b/drivers/md/bcache/btree_locking.h
index fb6ce606eea4..ad0dc3a69b48 100644
--- a/drivers/md/bcache/btree_locking.h
+++ b/drivers/md/bcache/btree_locking.h
@@ -87,62 +87,27 @@ static inline bool btree_want_intent(struct btree_iter *iter, int level)
return btree_lock_want(iter, level) == SIX_LOCK_intent;
}
-static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level,
- struct btree *b)
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
- switch (btree_node_locked_type(iter, level)) {
- case BTREE_NODE_READ_LOCKED:
- six_unlock_read(&b->lock);
- break;
- case BTREE_NODE_INTENT_LOCKED:
- six_unlock_intent(&b->lock);
- break;
- }
+ int lock_type = btree_node_locked_type(iter, level);
+ if (lock_type != BTREE_NODE_UNLOCKED)
+ six_unlock_type(&iter->nodes[level]->lock, lock_type);
mark_btree_node_unlocked(iter, level);
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
-{
- __btree_node_unlock(iter, level, iter->nodes[level]);
-}
+bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned,
+ struct btree_iter *, enum six_lock_type);
-static inline void btree_node_lock_type(struct btree *b, struct btree_iter *iter,
- enum six_lock_type type)
+static inline bool btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
{
- struct btree_iter *linked;
-
- if (six_trylock_type(&b->lock, type))
- return;
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[b->level] == b &&
- btree_node_locked_type(linked, b->level) == type) {
- six_lock_increment(&b->lock, type);
- return;
- }
-
- six_lock_type(&b->lock, type);
+ return six_trylock_type(&b->lock, type) ||
+ __bch_btree_node_lock(b, pos, level, iter, type);
}
-#define __btree_node_lock(b, _iter, _level, check_if_raced) \
-({ \
- enum six_lock_type _type = btree_lock_want(_iter, _level); \
- bool _raced; \
- \
- btree_node_lock_type(b, _iter, _type); \
- if ((_raced = ((check_if_raced) || ((b)->level != _level)))) \
- six_unlock_type(&(b)->lock, _type); \
- else \
- mark_btree_node_locked(_iter, _level, _type); \
- \
- !_raced; \
-})
-
-#define btree_node_lock(b, iter, level, check_if_raced) \
- (!race_fault() && \
- __btree_node_lock(b, iter, level, check_if_raced))
-
bool btree_node_relock(struct btree_iter *, unsigned);
void btree_node_unlock_write(struct btree *, struct btree_iter *);
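
btree_node_lock() is now an inline fast path that tries six_trylock_type() and only falls back to the out-of-line __bch_btree_node_lock() slowpath, which is where the lock-ordering and deadlock-avoidance checks live. Here is a minimal pthread sketch of that trylock-fast-path / blocking-slowpath split; lock_node() and lock_node_slowpath() are hypothetical names, and unlike this sketch the real slowpath can refuse the lock and return false.

#include <pthread.h>
#include <stdbool.h>

struct node {
	pthread_mutex_t lock;
};

/* Out-of-line slowpath: ordering/deadlock checks would go here. */
static bool lock_node_slowpath(struct node *b)
{
	pthread_mutex_lock(&b->lock);
	return true;
}

/* Fast path stays inline and cheap in the common uncontended case: */
static inline bool lock_node(struct node *b)
{
	return pthread_mutex_trylock(&b->lock) == 0 ||
	       lock_node_slowpath(b);
}

int main(void)
{
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER };

	if (lock_node(&n))		/* uncontended: trylock fast path */
		pthread_mutex_unlock(&n.lock);
	return 0;
}
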
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index 9b6cac235ae6..d4bfd711ac65 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -46,6 +46,12 @@ struct btree {
u16 written;
u8 level;
u8 btree_id;
+ /*
+ * XXX: add a delete sequence number, so when btree_node_relock() fails
+ * because the lock sequence number has changed - i.e. the contents were
+ * modified - we can still relock the node if it's still the one we
+ * want, without redoing the traversal
+ */
/*
* For asynchronous splits/interior node updates:
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 7aef5cd9f9d9..abb94a4fcf79 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -251,7 +251,7 @@ retry:
goto retry;
}
mem_alloc:
- b = mca_alloc(c, NULL);
+ b = mca_alloc(c);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
@@ -1439,7 +1439,6 @@ void bch_btree_insert_node(struct btree *b,
static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
struct closure *cl)
{
- struct btree_iter *linked;
struct cache_set *c = iter->c;
struct btree *b = iter->nodes[0];
struct btree_reserve *reserve;
@@ -1456,18 +1455,16 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
- iter->locks_want = U8_MAX;
-
- if (!bch_btree_iter_upgrade(iter)) {
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
ret = -EINTR;
- goto out_get_locks;
+ goto out;
}
reserve = bch_btree_reserve_get(c, b, 0,
!(flags & BTREE_INSERT_NOFAIL), cl);
if (IS_ERR(reserve)) {
ret = PTR_ERR(reserve);
- goto out_get_locks;
+ goto out;
}
as = bch_btree_interior_update_alloc(c);
@@ -1475,29 +1472,10 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
btree_split(b, iter, NULL, reserve, as);
bch_btree_reserve_put(c, reserve);
- iter->locks_want = 1;
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->btree_id == iter->btree_id &&
- btree_iter_cmp(linked, iter) <= 0)
- linked->locks_want = 1;
+ bch_btree_iter_set_locks_want(iter, 1);
out:
up_read(&c->gc_lock);
return ret;
-out_get_locks:
- /* Lock ordering... */
- for_each_linked_btree_iter(iter, linked)
- if (linked->btree_id == iter->btree_id &&
- btree_iter_cmp(linked, iter) <= 0) {
- unsigned i;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++) {
- btree_node_unlock(linked, i);
- linked->lock_seq[i]--;
- }
- linked->locks_want = U8_MAX;
- }
- goto out;
}
/**
@@ -1606,14 +1584,11 @@ int __bch_btree_insert_at(struct btree_insert *trans, u64 *journal_seq)
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
-
- trans_for_each_entry(trans, i) {
- i->iter->locks_want = max_t(int, i->iter->locks_want, 1);
- if (unlikely(!bch_btree_iter_upgrade(i->iter))) {
- ret = -EINTR;
+retry_locks:
+ ret = -EINTR;
+ trans_for_each_entry(trans, i)
+ if (!bch_btree_iter_set_locks_want(i->iter, 1))
goto err;
- }
- }
retry:
trans->did_work = false;
u64s = 0;
@@ -1694,6 +1669,11 @@ unlock:
if (!same_leaf_as_prev(trans, i))
bch_btree_node_write_lazy(i->iter->nodes[0], i->iter);
out:
+ /* make sure we didn't lose an error: */
+ if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG))
+ trans_for_each_entry(trans, i)
+ BUG_ON(!i->done);
+
percpu_ref_put(&c->writes);
return ret;
split:
@@ -1705,21 +1685,15 @@ split:
ret = bch_btree_split_leaf(split, trans->flags, &cl);
if (ret)
goto err;
-
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
* and is overwriting won't have changed)
*/
- goto retry;
+ goto retry_locks;
err:
if (ret == -EAGAIN) {
- struct btree_iter *linked;
-
- for_each_linked_btree_iter(split, linked)
- bch_btree_iter_unlock(linked);
bch_btree_iter_unlock(split);
-
closure_sync(&cl);
ret = -EINTR;
}
@@ -1729,28 +1703,23 @@ err:
up_read(&c->gc_lock);
}
- /*
- * Main rule is, BTREE_INSERT_ATOMIC means we can't call
- * bch_btree_iter_traverse(), because if we have to we either dropped
- * locks or we need a different btree node (different than the one the
- * caller was looking at).
- *
- * BTREE_INSERT_ATOMIC doesn't mean anything w.r.t. journal
- * reservations:
- */
- if (ret == -EINTR && !(trans->flags & BTREE_INSERT_ATOMIC)) {
+ if (ret == -EINTR) {
trans_for_each_entry(trans, i) {
- ret = bch_btree_iter_traverse(i->iter);
- if (ret)
+ int ret2 = bch_btree_iter_traverse(i->iter);
+ if (ret2) {
+ ret = ret2;
goto out;
+ }
}
- ret = 0;
+ /*
+ * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+ * dropped locks:
+ */
+ if (!(trans->flags & BTREE_INSERT_ATOMIC))
+ goto retry;
}
- if (!ret)
- goto retry;
-
goto out;
}
@@ -1859,22 +1828,26 @@ int bch_btree_update(struct cache_set *c, enum btree_id id,
{
struct btree_iter iter;
struct bkey_s_c u;
- int ret, ret2;
+ int ret;
EBUG_ON(id == BTREE_ID_EXTENTS);
bch_btree_iter_init_intent(&iter, c, id, k->k.p);
u = bch_btree_iter_peek_with_holes(&iter);
+ ret = btree_iter_err(u);
+ if (ret)
+ return ret;
- if (!u.k || bkey_deleted(u.k))
+ if (bkey_deleted(u.k)) {
+ bch_btree_iter_unlock(&iter);
return -ENOENT;
+ }
ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0,
BTREE_INSERT_ENTRY(&iter, k));
- ret2 = bch_btree_iter_unlock(&iter);
-
- return ret ?: ret2;
+ bch_btree_iter_unlock(&iter);
+ return ret;
}
/*
@@ -1896,7 +1869,8 @@ int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
bch_btree_iter_init_intent(&iter, c, id, start);
- while ((k = bch_btree_iter_peek(&iter)).k) {
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
/* really shouldn't be using a bare, unpadded bkey_i */
struct bkey_i delete;
@@ -1943,7 +1917,8 @@ int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
bch_btree_iter_cond_resched(&iter);
}
- return bch_btree_iter_unlock(&iter) ?: ret;
+ bch_btree_iter_unlock(&iter);
+ return ret;
}
/**
@@ -1960,11 +1935,10 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
struct btree_reserve *reserve;
struct btree_interior_update *as;
- iter->locks_want = U8_MAX;
- if (!bch_btree_iter_upgrade(iter))
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX))
return -EINTR;
- reserve = bch_btree_reserve_get(c, b, 1, false, cl);
+ reserve = bch_btree_reserve_get(c, b, 0, false, cl);
if (IS_ERR(reserve)) {
trace_bcache_btree_gc_rewrite_node_fail(b);
return PTR_ERR(reserve);
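
With this change __bch_btree_insert_at() responds to -EINTR by re-traversing every iterator in the transaction and retrying, unless BTREE_INSERT_ATOMIC was set, in which case -EINTR is returned so the caller can redo its peek. A self-contained sketch of that control flow follows; do_insert(), traverse() and FLAG_ATOMIC are illustrative stubs, not the kernel functions or flags.

#include <errno.h>
#include <stdio.h>

#define FLAG_ATOMIC	(1 << 0)	/* stand-in for BTREE_INSERT_ATOMIC */

/* Illustrative stubs: the first attempt "drops locks", the second succeeds. */
static int attempts;
static int traverse(void)  { return 0; }
static int do_insert(void) { return attempts++ ? 0 : -EINTR; }

static int insert_with_retry(unsigned flags)
{
	int ret;

	while (1) {
		ret = do_insert();
		if (ret != -EINTR)
			return ret;

		/* locks were dropped: walk back down to a usable leaf */
		ret = traverse();
		if (ret)
			return ret;

		/*
		 * With the atomic flag the caller's earlier peek may no
		 * longer be valid, so -EINTR has to be passed back up:
		 */
		if (flags & FLAG_ATOMIC)
			return -EINTR;
	}
}

int main(void)
{
	printf("non-atomic: %d\n", insert_with_retry(0));	     /* 0 */
	attempts = 0;
	printf("atomic:     %d\n", insert_with_retry(FLAG_ATOMIC)); /* -EINTR */
	return 0;
}
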
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 3ada6d9c4e56..4396e792947e 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -265,7 +265,8 @@ static ssize_t bch_read_btree(struct file *file, char __user *buf,
bch_btree_iter_init(&iter, i->c, i->id, i->from);
- while ((k = bch_btree_iter_peek(&iter)).k) {
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
bch_bkey_val_to_text(i->c, bkey_type(0, i->id),
i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
@@ -408,7 +409,8 @@ static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf,
bch_btree_iter_init(&iter, i->c, i->id, i->from);
- while ((k = bch_btree_iter_peek(&iter)).k) {
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
struct btree_keys *b = &iter.nodes[0]->keys;
struct btree_node_iter *node_iter = &iter.node_iters[0];
struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b);
diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c
index 47b4a365a7d8..68fe648ca64a 100644
--- a/drivers/md/bcache/dirent.c
+++ b/drivers/md/bcache/dirent.c
@@ -222,6 +222,13 @@ int bch_dirent_rename(struct cache_set *c,
bool need_whiteout;
int ret = -ENOMEM;
+ bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
+ bch_btree_iter_link(&src_iter, &dst_iter);
+
+ bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_link(&src_iter, &whiteout_iter);
+
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(0, src_name, 0);
if (!new_src)
@@ -233,146 +240,112 @@ int bch_dirent_rename(struct cache_set *c,
new_dst = dirent_create_key(0, dst_name, 0);
if (!new_dst)
goto err;
+retry:
+ /*
+ * Note that on -EINTR/dropped locks we're not restarting the lookup
+ * from the original hashed position (like we do when creating dirents,
+ * in bch_hash_set) - we never move existing dirents to different slot:
+ */
+ old_src = bch_hash_lookup_at(dirent_hash_desc,
+ &src_ei->str_hash,
+ &src_iter, src_name);
+ if ((ret = btree_iter_err(old_src)))
+ goto err;
- bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
- bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
- bch_btree_iter_link(&src_iter, &dst_iter);
-
- do {
- /*
- * Have to traverse lower btree nodes before higher - due to
- * lock ordering.
- */
- if (bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- old_src = bch_hash_lookup_at(dirent_hash_desc,
- &src_ei->str_hash,
- &src_iter, src_name);
+ ret = bch_hash_needs_whiteout(dirent_hash_desc,
+ &src_ei->str_hash,
+ &whiteout_iter, &src_iter);
+ if (ret < 0)
+ goto err;
+ need_whiteout = ret;
+
+ /*
+ * Note that in BCH_RENAME mode, we're _not_ checking if
+ * the target already exists - we're relying on the VFS
+ * to do that check for us for correctness:
+ */
+ old_dst = mode == BCH_RENAME
+ ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
+ : bch_hash_lookup_at(dirent_hash_desc,
+ &dst_ei->str_hash,
+ &dst_iter, dst_name);
+ if ((ret = btree_iter_err(old_dst)))
+ goto err;
- need_whiteout = bch_hash_needs_whiteout(dirent_hash_desc,
- &src_ei->str_hash,
- &whiteout_iter, &src_iter);
+ switch (mode) {
+ case BCH_RENAME:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
/*
- * Note that in BCH_RENAME mode, we're _not_ checking if
- * the target already exists - we're relying on the VFS
- * to do that check for us for correctness:
+ * If we couldn't insert new_dst at its hashed
+ * position (dst_pos) due to a hash collision,
+ * and we're going to be deleting in
+ * between the hashed position and first empty
+ * slot we found - just overwrite the pos we
+ * were going to delete:
+ *
+ * Note: this is a correctness issue, in this
+ * situation bch_hash_needs_whiteout() could
+ * return false when the whiteout would have
+ * been needed if we inserted at the pos
+ * __dirent_find_hole() found
*/
- old_dst = mode == BCH_RENAME
- ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
- : bch_hash_lookup_at(dirent_hash_desc,
- &dst_ei->str_hash,
- &dst_iter, dst_name);
- } else {
- old_dst = mode == BCH_RENAME
- ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
- : bch_hash_lookup_at(dirent_hash_desc,
- &dst_ei->str_hash,
- &dst_iter, dst_name);
-
- old_src = bch_hash_lookup_at(dirent_hash_desc,
- &src_ei->str_hash,
- &src_iter, src_name);
-
- need_whiteout = bch_hash_needs_whiteout(dirent_hash_desc,
- &src_ei->str_hash,
- &whiteout_iter, &src_iter);
- }
-
- if (IS_ERR(old_src.k)) {
- ret = PTR_ERR(old_src.k);
- goto err_unlock;
+ new_dst->k.p = src_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL,
+ journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter,
+ &new_dst->k_i));
+ goto err;
}
- if (IS_ERR(old_dst.k)) {
- ret = PTR_ERR(old_dst.k);
- goto err_unlock;
- }
+ if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_OVERWRITE:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- switch (mode) {
- case BCH_RENAME:
- bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- /*
- * If we couldn't insert new_dst at its hashed
- * position (dst_pos) due to a hash collision,
- * and we're going to be deleting in
- * between the hashed position and first empty
- * slot we found - just overwrite the pos we
- * were going to delete:
- *
- * Note: this is a correctness issue, in this
- * situation bch_hash_needs_whiteout() could
- * return false when the whiteout would have
- * been needed if we inserted at the pos
- * __dirent_find_hole() found
- */
- new_dst->k.p = src_iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL,
- journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter,
- &new_dst->k_i));
- goto insert_done;
- }
-
- if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_OVERWRITE:
- bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- /*
- * Same case described above -
- * bch_hash_needs_whiteout could spuriously
- * return false, but we have to insert at
- * dst_iter.pos because we're overwriting
- * another dirent:
- */
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- } else if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_EXCHANGE:
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- break;
- }
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ /*
+ * Same case described above -
+ * bch_hash_needs_whiteout could spuriously
+ * return false, but we have to insert at
+ * dst_iter.pos because we're overwriting
+ * another dirent:
+ */
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ } else if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_EXCHANGE:
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ break;
+ }
- new_src->k.p = src_iter.pos;
- new_dst->k.p = dst_iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
- BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
-insert_done:
- bch_btree_iter_unlink(&whiteout_iter);
- bch_btree_iter_unlock(&src_iter);
- bch_btree_iter_unlock(&dst_iter);
-
- if (bkey_cmp(src_iter.pos, src_pos) ||
- bkey_cmp(dst_iter.pos, dst_pos)) {
- /* ugh */
- bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
- bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
- bch_btree_iter_link(&src_iter, &dst_iter);
- }
- } while (ret == -EINTR);
+ new_src->k.p = src_iter.pos;
+ new_dst->k.p = dst_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
+ BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch_btree_iter_unlock(&whiteout_iter);
+ bch_btree_iter_unlock(&dst_iter);
+ bch_btree_iter_unlock(&src_iter);
+
if (new_src != (void *) &delete)
kfree(new_src);
kfree(new_dst);
return ret;
-err_unlock:
- bch_btree_iter_unlink(&whiteout_iter);
- ret = bch_btree_iter_unlock(&src_iter) ?: ret;
- ret = bch_btree_iter_unlock(&dst_iter) ?: ret;
- goto err;
}
int bch_dirent_delete(struct inode *dir, const struct qstr *name)
diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c
index d4a7c75a85a0..11c3657d22f8 100644
--- a/drivers/md/bcache/fs-gc.c
+++ b/drivers/md/bcache/fs-gc.c
@@ -281,8 +281,8 @@ static int bch_gc_walk_inodes(struct cache_set *c, struct nlinks *links,
bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0));
genradix_iter_init(&nlinks_iter);
- while (1) {
- k = bch_btree_iter_peek(&iter);
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
if (!link && (!k.k || iter.pos.inode >= range_end))
diff --git a/drivers/md/bcache/fs-io.c b/drivers/md/bcache/fs-io.c
index 74d7578c9974..92d7a9bf8dcd 100644
--- a/drivers/md/bcache/fs-io.c
+++ b/drivers/md/bcache/fs-io.c
@@ -269,45 +269,41 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
struct keylist *keys = &op->op.insert_keys;
struct btree_iter extent_iter, inode_iter;
struct bchfs_extent_trans_hook hook;
+ struct bkey_i *k = bch_keylist_front(keys);
int ret;
- BUG_ON(bch_keylist_front(keys)->k.p.inode != op->ei->vfs_inode.i_ino);
+ BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino);
bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS,
bkey_start_pos(&bch_keylist_front(keys)->k));
bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES,
POS(extent_iter.pos.inode, 0));
- bch_btree_iter_link(&extent_iter, &inode_iter);
hook.op = op;
hook.hook.fn = bchfs_extent_update_hook;
hook.need_inode_update = false;
do {
- struct bkey_i *k = bch_keylist_front(keys);
-
- /* lock ordering... */
- bch_btree_iter_unlock(&inode_iter);
-
ret = bch_btree_iter_traverse(&extent_iter);
if (ret)
- break;
+ goto err;
/* XXX: ei->i_size locking */
+ k = bch_keylist_front(keys);
if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size)
hook.need_inode_update = true;
if (hook.need_inode_update) {
struct bkey_s_c inode;
- ret = bch_btree_iter_traverse(&inode_iter);
- if (ret)
- break;
+ if (!btree_iter_linked(&inode_iter))
+ bch_btree_iter_link(&extent_iter, &inode_iter);
inode = bch_btree_iter_peek_with_holes(&inode_iter);
+ if ((ret = btree_iter_err(inode)))
+ goto err;
- if (WARN_ONCE(!inode.k ||
- inode.k->type != BCH_INODE_FS,
+ if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
"inode %llu not found when updating",
extent_iter.pos.inode)) {
ret = -ENOENT;
@@ -327,7 +323,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
-
+err:
if (ret == -EINTR)
continue;
if (ret)
@@ -2058,16 +2054,13 @@ static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len)
bch_btree_iter_set_pos(&src,
POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
- /* Have to take intent locks before read locks: */
ret = bch_btree_iter_traverse(&dst);
if (ret)
- goto err_unwind;
+ goto btree_iter_err;
k = bch_btree_iter_peek_with_holes(&src);
- if (!k.k) {
- ret = -EIO;
- goto err_unwind;
- }
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
bkey_reassemble(&copy.k, k);
@@ -2089,11 +2082,11 @@ static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len)
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&dst, &copy.k));
bch_disk_reservation_put(c, &disk_res);
-
+btree_iter_err:
if (ret < 0 && ret != -EINTR)
goto err_unwind;
- bch_btree_iter_unlock(&src);
+ bch_btree_iter_cond_resched(&src);
}
bch_btree_iter_unlock(&src);
@@ -2195,10 +2188,8 @@ static long bch_fallocate(struct inode *inode, int mode,
struct disk_reservation disk_res = { 0 };
k = bch_btree_iter_peek_with_holes(&iter);
- if (!k.k) {
- ret = bch_btree_iter_unlock(&iter) ?: -EIO;
- goto err_put_sectors_dirty;
- }
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
/* already reserved */
if (k.k->type == BCH_RESERVATION) {
@@ -2237,7 +2228,7 @@ static long bch_fallocate(struct inode *inode, int mode,
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &reservation));
bch_disk_reservation_put(c, &disk_res);
-
+btree_iter_err:
if (ret < 0 && ret != -EINTR)
goto err_put_sectors_dirty;
diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c
index 67a59bf524d9..79e553a86a32 100644
--- a/drivers/md/bcache/fs.c
+++ b/drivers/md/bcache/fs.c
@@ -74,7 +74,10 @@ int __must_check __bch_write_inode(struct cache_set *c,
do {
struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
- if (WARN_ONCE(!k.k || k.k->type != BCH_INODE_FS,
+ if ((ret = btree_iter_err(k)))
+ goto out;
+
+ if (WARN_ONCE(k.k->type != BCH_INODE_FS,
"inode %llu not found when updating", inum)) {
bch_btree_iter_unlock(&iter);
return -ENOENT;
@@ -163,7 +166,8 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0));
k = bch_btree_iter_peek_with_holes(&iter);
- if (!k.k || k.k->type != BCH_INODE_FS) {
+
+ if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) {
ret = bch_btree_iter_unlock(&iter);
iget_failed(inode);
return ERR_PTR(ret ?: -ENOENT);
diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c
index 7cbe9b5dc757..9b2a4806805e 100644
--- a/drivers/md/bcache/inode.c
+++ b/drivers/md/bcache/inode.c
@@ -112,7 +112,6 @@ int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
u64 min, u64 max, u64 *hint)
{
struct btree_iter iter;
- struct bkey_s_c k;
bool searched_from_start = false;
int ret;
@@ -130,9 +129,14 @@ int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
again:
bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0));
- while ((k = bch_btree_iter_peek_with_holes(&iter)).k) {
- if (k.k->p.inode >= max)
- break;
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+
+ ret = btree_iter_err(k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
if (k.k->type < BCH_INODE_FS) {
inode->k.p = k.k->p;
@@ -153,6 +157,8 @@ again:
return ret;
} else {
+ if (iter.pos.inode == max)
+ break;
/* slot used */
bch_btree_iter_advance_pos(&iter);
}
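
bch_inode_create() scans the inode btree from *hint for an unused position and, judging by the searched_from_start flag and the again: label, restarts once from the bottom of the range if it reaches max without finding one. The sketch below shows that hint-plus-wraparound allocation over a plain bitmap; alloc_slot() and the used[] array are stand-ins for the btree scan, not the kernel code.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 16

static bool used[NR_SLOTS];	/* stand-in for "an inode exists at this pos" */

static long alloc_slot(unsigned min, unsigned max, unsigned *hint)
{
	bool searched_from_start = false;
	unsigned i = *hint;
again:
	for (; i < max; i++)
		if (!used[i]) {
			used[i] = true;
			*hint   = i;
			return i;
		}

	if (!searched_from_start) {
		i = min;		/* nothing free above the hint: wrap */
		searched_from_start = true;
		goto again;
	}

	return -ENOSPC;
}

int main(void)
{
	unsigned hint = 10;

	used[10] = used[11] = true;
	printf("got slot %ld\n", alloc_slot(1, NR_SLOTS, &hint));	/* 12 */
	return 0;
}
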
diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c
index 363d2b02282a..5a26e2286504 100644
--- a/drivers/md/bcache/migrate.c
+++ b/drivers/md/bcache/migrate.c
@@ -95,7 +95,8 @@ int bch_move_data_off_device(struct cache *ca)
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
while (!bch_move_ctxt_wait(&ctxt) &&
- (k = bch_btree_iter_peek(&iter)).k) {
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch_extent_has_device(bkey_s_c_to_extent(k),
ca->sb.nr_this_dev))
@@ -112,11 +113,8 @@ int bch_move_data_off_device(struct cache *ca)
bch_move_ctxt_wait_for_io(&ctxt);
continue;
}
- if (ret == -ENOSPC) {
- bch_btree_iter_unlock(&iter);
- bch_move_ctxt_exit(&ctxt);
- return -ENOSPC;
- }
+ if (ret == -ENOSPC)
+ break;
BUG_ON(ret);
seen_key_count++;
@@ -125,7 +123,7 @@ next:
bch_btree_iter_cond_resched(&iter);
}
- ret = bch_btree_iter_unlock(&iter);
+ bch_btree_iter_unlock(&iter);
bch_move_ctxt_exit(&ctxt);
if (ret)
@@ -180,7 +178,7 @@ retry:
return ret;
}
- iter.locks_want = 0;
+ bch_btree_iter_set_locks_want(&iter, 0);
}
ret = bch_btree_iter_unlock(&iter);
if (ret)
@@ -312,14 +310,15 @@ static int bch_flag_key_bad(struct btree_iter *iter,
int bch_flag_data_bad(struct cache *ca)
{
- int ret = 0, ret2;
+ int ret = 0;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct btree_iter iter;
bch_btree_iter_init(&iter, ca->set, BTREE_ID_EXTENTS, POS_MIN);
- while ((k = bch_btree_iter_peek(&iter)).k) {
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k))
goto advance;
@@ -364,7 +363,7 @@ advance:
bch_btree_iter_advance_pos(&iter);
}
- ret2 = bch_btree_iter_unlock(&iter);
+ bch_btree_iter_unlock(&iter);
- return ret ?: ret2;
+ return ret;
}
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d93adbd2e609..27f5d572107c 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -78,7 +78,8 @@ static void read_moving(struct cache *ca, size_t buckets_to_move)
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
while (!bch_move_ctxt_wait(&ctxt) &&
- (k = bch_btree_iter_peek(&iter)).k) {
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
if (!moving_pred(ca, k))
goto next;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 71883352abd8..c8ed80f58bf8 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -508,6 +508,8 @@ retry:
/* not present (hole), or stale cached data */
if (cached_dev_cache_miss(&iter, s, bio, sectors)) {
k = bch_btree_iter_peek_with_holes(&iter);
+ if (btree_iter_err(k))
+ break;
goto retry;
}
}
diff --git a/drivers/md/bcache/str_hash.h b/drivers/md/bcache/str_hash.h
index f6a441b7be00..9a718a8e9eaa 100644
--- a/drivers/md/bcache/str_hash.h
+++ b/drivers/md/bcache/str_hash.h
@@ -95,12 +95,13 @@ bch_hash_lookup_at(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter, const void *search)
{
- struct bkey_s_c k;
u64 inode = iter->pos.inode;
- while ((k = bch_btree_iter_peek_with_holes(iter)).k) {
- if (k.k->p.inode != inode)
- break;
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
if (k.k->type == desc.key_type) {
if (!desc.cmp_key(k, search))
@@ -113,7 +114,7 @@ bch_hash_lookup_at(const struct bch_hash_desc desc,
}
bch_btree_iter_advance_pos(iter);
- }
+ } while (iter->pos.inode == inode);
return bkey_s_c_err(-ENOENT);
}
@@ -123,12 +124,13 @@ bch_hash_lookup_bkey_at(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter, struct bkey_s_c search)
{
- struct bkey_s_c k;
u64 inode = iter->pos.inode;
- while ((k = bch_btree_iter_peek_with_holes(iter)).k) {
- if (k.k->p.inode != inode)
- break;
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
if (k.k->type == desc.key_type) {
if (!desc.cmp_bkey(k, search))
@@ -141,7 +143,7 @@ bch_hash_lookup_bkey_at(const struct bch_hash_desc desc,
}
bch_btree_iter_advance_pos(iter);
- }
+ } while (iter->pos.inode == inode);
return bkey_s_c_err(-ENOENT);
}
@@ -173,21 +175,20 @@ bch_hash_lookup_intent(const struct bch_hash_desc desc,
static inline struct bkey_s_c
bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
{
- struct bkey_s_c k;
- u64 inode = iter->pos.inode;
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
- while ((k = bch_btree_iter_peek_with_holes(iter)).k) {
- if (k.k->p.inode != inode)
- break;
+ if (btree_iter_err(k))
+ return k;
if (k.k->type != desc.key_type)
return k;
/* hash collision, keep going */
bch_btree_iter_advance_pos(iter);
+ if (iter->pos.inode != k.k->p.inode)
+ return bkey_s_c_err(-ENOENT);
}
-
- return bkey_s_c_err(-ENOSPC);
}
static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc,
@@ -202,21 +203,20 @@ static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc,
return bch_hash_hole_at(desc, iter);
}
-static inline bool bch_hash_needs_whiteout(const struct bch_hash_desc desc,
+static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter,
struct btree_iter *start)
{
- struct bkey_s_c k;
-
- bch_btree_iter_init_copy(iter, start);
+ bch_btree_iter_set_pos(iter,
+ btree_type_successor(start->btree_id, start->pos));
while (1) {
- bch_btree_iter_advance_pos(iter);
- k = bch_btree_iter_peek_with_holes(iter);
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+ int ret = btree_iter_err(k);
- if (!k.k)
- return iter->error ? true : false;
+ if (ret)
+ return ret;
if (k.k->type != desc.key_type &&
k.k->type != desc.whiteout_type)
@@ -225,6 +225,8 @@ static inline bool bch_hash_needs_whiteout(const struct bch_hash_desc desc,
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset)
return true;
+
+ bch_btree_iter_advance_pos(iter);
}
}
@@ -245,54 +247,57 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
bch_btree_iter_link(&hashed_slot, &iter);
+retry:
+ /*
+ * On hash collision, we have to keep the slot we hashed to locked while
+ * we do the insert - to avoid racing with another thread deleting
+ * whatever's in the slot we hashed to:
+ */
+ ret = bch_btree_iter_traverse(&hashed_slot);
+ if (ret)
+ goto err;
- do {
- ret = bch_btree_iter_traverse(&hashed_slot);
- if (ret)
- break;
+ /*
+ * On -EINTR/retry, we dropped locks - always restart from the slot we
+ * hashed to:
+ */
+ bch_btree_iter_copy(&iter, &hashed_slot);
+
+ k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
+
+ ret = btree_iter_err(k);
+ if (ret == -ENOENT) {
+ if (flags & BCH_HASH_SET_MUST_REPLACE) {
+ ret = -ENOENT;
+ goto err;
+ }
/*
- * If there's a hash collision and we have to insert at a
- * different slot, on -EINTR (insert had to drop locks) we have
- * to recheck the slot we hashed to - it could have been deleted
- * while we dropped locks:
+ * Not found, so we're now looking for any open
+ * slot - we might have skipped over a whiteout
+ * that we could have used, so restart from the
+ * slot we hashed to:
*/
bch_btree_iter_copy(&iter, &hashed_slot);
- k = bch_hash_lookup_bkey_at(desc, info, &iter,
- bkey_i_to_s_c(insert));
- if (IS_ERR(k.k)) {
- if (flags & BCH_HASH_SET_MUST_REPLACE) {
- ret = -ENOENT;
- goto err;
- }
-
- /*
- * Not found, so we're now looking for any open
- * slot - we might have skipped over a whiteout
- * that we could have used, so restart from the
- * slot we hashed to:
- */
- bch_btree_iter_copy(&iter, &hashed_slot);
- k = bch_hash_hole_at(desc, &iter);
- if (IS_ERR(k.k)) {
- ret = PTR_ERR(k.k);
- goto err;
- }
- } else {
- if (flags & BCH_HASH_SET_MUST_CREATE) {
- ret = -EEXIST;
- goto err;
- }
+ k = bch_hash_hole_at(desc, &iter);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+ } else if (!ret) {
+ if (flags & BCH_HASH_SET_MUST_CREATE) {
+ ret = -EEXIST;
+ goto err;
}
+ } else {
+ goto err;
+ }
- insert->k.p = iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, insert));
-
- /* unlock before traversing hashed_slot: */
- bch_btree_iter_unlock(&iter);
- } while (ret == -EINTR);
+ insert->k.p = iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, insert));
+err:
+ if (ret == -EINTR)
+ goto retry;
/*
* On successful insert, we don't want to clobber ret with error from
@@ -301,10 +306,6 @@ static inline int bch_hash_set(const struct bch_hash_desc desc,
bch_btree_iter_unlock(&iter);
bch_btree_iter_unlock(&hashed_slot);
return ret;
-err:
- ret = bch_btree_iter_unlock(&iter) ?: ret;
- ret = bch_btree_iter_unlock(&hashed_slot) ?: ret;
- return ret;
}
static inline int bch_hash_delete(const struct bch_hash_desc desc,
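[Editor's note: the bch_hash_set() hunk above trades its do { } while (ret == -EINTR) loop for a retry:/err: label pair, so a restart always re-traverses the slot it hashed to and the unlocks run exactly once on the way out. A stripped-down sketch of that retry discipline; lookup_slot(), atomic_insert() and the simulated -EINTR below are invented for illustration.]

#include <errno.h>
#include <stdio.h>

/* pretend the first insert attempt has to drop locks and asks for a retry */
static int attempts;

static int lookup_slot(void)	{ return 0; }
static int atomic_insert(void)	{ return attempts++ ? 0 : -EINTR; }
static void unlock_all(void)	{ }

static int hash_set(void)
{
	int ret;

retry:
	/* always restart from the slot we hashed to: locks were dropped */
	ret = lookup_slot();
	if (ret)
		goto err;

	ret = atomic_insert();		/* may fail with -EINTR */
err:
	if (ret == -EINTR)
		goto retry;

	/* single unlock path; errors other than -EINTR fall through to here */
	unlock_all();
	return ret;
}

int main(void)
{
	int ret = hash_set();

	/* prints: hash_set() = 0 after 2 attempt(s) */
	printf("hash_set() = %d after %d attempt(s)\n", ret, attempts);
	return 0;
}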
@@ -319,34 +320,31 @@ static inline int bch_hash_delete(const struct bch_hash_desc desc,
bch_btree_iter_init_intent(&iter, c, desc.btree_id,
POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_init(&whiteout_iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_link(&iter, &whiteout_iter);
+retry:
+ k = bch_hash_lookup_at(desc, info, &iter, key);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+
+ ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
+ if (ret < 0)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+ delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, &delete));
+err:
+ if (ret == -EINTR)
+ goto retry;
- do {
- k = bch_hash_lookup_at(desc, info, &iter, key);
- if (IS_ERR(k.k))
- return bch_btree_iter_unlock(&iter) ?: -ENOENT;
-
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
- delete.k.type = bch_hash_needs_whiteout(desc, info,
- &whiteout_iter, &iter)
- ? desc.whiteout_type
- : KEY_TYPE_DELETED;
-
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, &delete));
- /*
- * Need to hold whiteout iter locked while we do the delete, if
- * we're not leaving a whiteout:
- */
- bch_btree_iter_unlink(&whiteout_iter);
- /*
- * XXX: if we ever cleanup whiteouts, we may need to rewind
- * iterator on -EINTR
- */
- } while (ret == -EINTR);
-
+ bch_btree_iter_unlock(&whiteout_iter);
bch_btree_iter_unlock(&iter);
return ret;
}
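[Editor's note: bch_hash_needs_whiteout() now returns an int so iterator errors can propagate: negative values are errors, 0/1 is the old boolean answer, and bch_hash_delete() picks the deletion key type from that result. The convention in miniature, with placeholder names (needs_whiteout(), hash_delete()) and made-up key-type constants.]

#include <errno.h>
#include <stdio.h>

enum { FAKE_KEY_TYPE_DELETED = 0, FAKE_KEY_TYPE_WHITEOUT = 1 };

/*
 * Tri-state return: < 0 is an error from the iterator, 0 means nothing later
 * in the chain hashes back to this slot, 1 means something does, so deleting
 * must leave a whiteout behind.
 */
static int needs_whiteout(int simulate_err)
{
	if (simulate_err)
		return -EINTR;
	return 1;
}

static int hash_delete(int simulate_err)
{
	int key_type;
	int ret = needs_whiteout(simulate_err);

	if (ret < 0)
		return ret;			/* iterator error propagates */

	/* the 0/1 answer picks what we leave behind in the slot */
	key_type = ret ? FAKE_KEY_TYPE_WHITEOUT : FAKE_KEY_TYPE_DELETED;
	printf("deleting with key type %d\n", key_type);
	return 0;
}

int main(void)
{
	printf("ok:    %d\n", hash_delete(0));
	printf("error: %d\n", hash_delete(1));
	return 0;
}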
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index baecaf4c3aa7..bbace2f1a6b1 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -127,7 +127,8 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
while (!bch_move_ctxt_wait(&ctxt) &&
- (k = bch_btree_iter_peek(&iter)).k) {
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
if (!tiering_pred(c, &s, k))
goto next;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e3c62520159e..6397b0f0ae69 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -542,35 +542,33 @@ TRACE_EVENT(bcache_mca_scan,
);
DECLARE_EVENT_CLASS(mca_cannibalize_lock,
- TP_PROTO(struct cache_set *c, struct closure *cl),
- TP_ARGS(c, cl),
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c),
TP_STRUCT__entry(
__array(char, uuid, 16 )
- __field(struct closure *, cl )
),
TP_fast_assign(
memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
- __entry->cl = cl;
),
- TP_printk("%pU cl %p", __entry->uuid, __entry->cl)
+ TP_printk("%pU", __entry->uuid)
);
DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock_fail,
- TP_PROTO(struct cache_set *c, struct closure *cl),
- TP_ARGS(c, cl)
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
);
DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock,
- TP_PROTO(struct cache_set *c, struct closure *cl),
- TP_ARGS(c, cl)
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
);
DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize,
- TP_PROTO(struct cache_set *c, struct closure *cl),
- TP_ARGS(c, cl)
+ TP_PROTO(struct cache_set *c),
+ TP_ARGS(c)
);
DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock,
@@ -578,46 +576,6 @@ DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock,
TP_ARGS(c)
);
-DECLARE_EVENT_CLASS(btree_node_op,
- TP_PROTO(struct btree *b, void *op),
- TP_ARGS(b, op),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, bucket )
- __field(u8, level )
- __field(u8, id )
- __field(void *, op )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, b->c->disk_sb.user_uuid.b, 16);
- __entry->bucket = PTR_BUCKET_NR_TRACE(b->c, &b->key, 0);
- __entry->level = b->level;
- __entry->id = b->btree_id;
- __entry->op = op;
- ),
-
- TP_printk("%pU bucket %llu(%u) id %u op %p",
- __entry->uuid, __entry->bucket, __entry->level, __entry->id,
- __entry->op)
-);
-
-DEFINE_EVENT(btree_node_op, bcache_btree_upgrade_lock,
- TP_PROTO(struct btree *b, void *op),
- TP_ARGS(b, op)
-);
-
-DEFINE_EVENT(btree_node_op, bcache_btree_upgrade_lock_fail,
- TP_PROTO(struct btree *b, void *op),
- TP_ARGS(b, op)
-);
-
-DEFINE_EVENT(btree_node_op, bcache_btree_intent_lock_fail,
- TP_PROTO(struct btree *b, void *op),
- TP_ARGS(b, op)
-);
-
TRACE_EVENT(bcache_btree_insert_key,
TP_PROTO(struct btree *b, struct bkey_i *k),
TP_ARGS(b, k),
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 1c5111d4d5a6..01323c95024d 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -1092,7 +1092,7 @@ enum btree_id {
#undef DEF_BTREE_ID
-#define BTREE_MAX_DEPTH 4
+#define BTREE_MAX_DEPTH 4U
/* Btree nodes */