author		Kent Overstreet <kent.overstreet@gmail.com>	2016-11-06 02:55:57 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-18 21:40:55 -0900
commit		486b6304937fb3f51fe3a36c5d1b3ae117bd0edd (patch)
tree		257341fe631c091b4d2d2daa5ccd97abed9882eb
parent		a8c56da481e4f6c6235e5225a3eea4c151f2195a (diff)
bcache: btree_split() refactoring
-rw-r--r--	drivers/md/bcache/btree_update.c	132
-rw-r--r--	drivers/md/bcache/btree_update.h	2
2 files changed, 55 insertions, 79 deletions
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 41884f8fc83e..b621f411147d 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -1218,8 +1218,6 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
n2->keys.set->size = 0;
n2->keys.set->extra = BSET_AUX_TREE_NONE_VAL;
- six_unlock_write(&n2->lock);
-
bch_verify_btree_nr_keys(&n1->keys);
bch_verify_btree_nr_keys(&n2->keys);
@@ -1231,38 +1229,61 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
return n2;
}
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing, if we do the insert after
+ * we do the split (and pick the pivot) - the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ */
static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
struct keylist *keys,
- struct btree_reserve *res,
- bool is_last)
+ struct btree_reserve *res)
{
struct btree_node_iter node_iter;
struct bkey_i *k = bch_keylist_front(keys);
+ struct bkey_packed *p;
+ struct bset *i;
BUG_ON(!b->level);
BUG_ON(b->keys.ops->is_extents);
bch_btree_node_iter_init(&node_iter, &b->keys, k->k.p, false);
- six_lock_write(&b->lock);
-
while (!bch_keylist_empty(keys)) {
k = bch_keylist_front(keys);
BUG_ON(bch_keylist_u64s(keys) >
bch_btree_keys_u64s_remaining(iter->c, b));
BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
-
- if (bkey_cmp(k->k.p, b->key.k.p) > 0) {
- BUG_ON(is_last);
- break;
- }
+ BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
bch_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res);
bch_keylist_pop_front(keys);
}
- six_unlock_write(&b->lock);
+ /*
+ * We can't tolerate whiteouts here - with whiteouts there can be
+ * duplicate keys, and it would be rather bad if we picked a duplicate
+ * for the pivot:
+ */
+ i = btree_bset_first(b);
+ p = i->start;
+ while (p != bset_bkey_last(i))
+ if (bkey_deleted(p)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - p->u64s);
+ memmove_u64s_down(p, bkey_next(p),
+ (u64 *) bset_bkey_last(i) -
+ (u64 *) p);
+ } else
+ p = bkey_next(p);
+
+ BUG_ON(b->keys.nsets ||
+ b->keys.nr.live_u64s != le16_to_cpu(b->keys.set->data->u64s));
btree_node_interior_verify(b);
}
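
The whiteout-stripping loop above is an in-place compaction: whenever a deleted key is found, the tail of the bset is shifted down over it and the u64s count shrinks, and the cursor advances only past keys that are kept. Below is a minimal userspace sketch of the same loop over a simplified fixed-size-entry set; the struct and function names here are hypothetical stand-ins, not the kernel's packed bkeys.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry {
	uint64_t key;
	int	 deleted;	/* stands in for bkey_deleted() */
};

struct set {
	unsigned     nr;
	struct entry data[16];
};

/* Drop deleted entries in place, shifting the tail down
 * (cf. memmove_u64s_down() in the hunk above). */
static void set_strip_deleted(struct set *s)
{
	struct entry *p = s->data;

	while (p != s->data + s->nr) {
		if (p->deleted) {
			memmove(p, p + 1,
				(s->data + s->nr - (p + 1)) * sizeof(*p));
			s->nr--;
			/* don't advance: p now points at the next entry */
		} else {
			p++;
		}
	}
}

int main(void)
{
	/* key 2 appears twice: a whiteout plus its replacement */
	struct set s = { .nr = 4, .data = {
		{ 1, 0 }, { 2, 1 }, { 2, 0 }, { 3, 0 },
	} };

	set_strip_deleted(&s);
	for (unsigned i = 0; i < s.nr; i++)
		printf("%llu\n", (unsigned long long) s.data[i].key);
	return 0;
}

As in the patch, not advancing the cursor after a removal is what makes the loop correct: the entry shifted into the current slot still has to be examined.
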
@@ -1275,9 +1296,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
struct cache_set *c = iter->c;
struct btree *parent = iter->nodes[b->level + 1];
struct btree *n1, *n2 = NULL, *n3 = NULL;
- uint64_t start_time = local_clock();
- unsigned u64s_to_insert = b->level
- ? bch_keylist_u64s(insert_keys) : 0;
+ u64 start_time = local_clock();
BUG_ON(!parent && (b != btree_node_root(b)));
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level));
@@ -1285,59 +1304,16 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
bch_btree_interior_update_will_free_node(c, as, b);
n1 = btree_node_alloc_replacement(c, b, reserve);
-
- /*
- * For updates to interior nodes, we've got to do the insert
- * before we split because the stuff we're inserting has to be
- * inserted atomically. Post split, the keys might have to go in
- * different nodes and the split would no longer be atomic.
- *
- * Worse, if the insert is from btree node coalescing, if we do the
- * insert after we do the split (and pick the pivot) - the pivot we pick
- * might be between nodes that were coalesced, and thus in the middle of
- * a child node post coalescing:
- */
- if (b->level) {
- struct bkey_packed *k;
- struct bset *i;
-
- six_unlock_write(&n1->lock);
- btree_split_insert_keys(iter, n1, insert_keys, reserve, true);
- six_lock_write(&n1->lock);
-
- /*
- * There might be duplicate (deleted) keys after the
- * bch_btree_insert_keys() call - we need to remove them before
- * we split, as it would be rather bad if we picked a duplicate
- * for the pivot.
- *
- * Additionally, inserting might overwrite a bunch of existing
- * keys (i.e. a big discard when there were a bunch of small
- * extents previously) - we might not want to split after the
- * insert. Splitting a node that's too small to be split would
- * be bad (if the node had only one key, we wouldn't be able to
- * assign the new node a key different from the original node)
- */
- i = btree_bset_first(n1);
- k = i->start;
- while (k != bset_bkey_last(i))
- if (bkey_deleted(k)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
- (u64 *) bset_bkey_last(i) -
- (u64 *) k);
- } else
- k = bkey_next(k);
-
- btree_node_interior_verify(n1);
- }
+ if (b->level)
+ btree_split_insert_keys(iter, n1, insert_keys, reserve);
if (__set_blocks(n1->data,
- le16_to_cpu(n1->data->keys.u64s) + u64s_to_insert,
- block_bytes(n1->c)) > btree_blocks(c) * 3 / 4) {
- trace_bcache_btree_node_split(b, le16_to_cpu(btree_bset_first(n1)->u64s));
+ le16_to_cpu(n1->data->keys.u64s),
+ block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) {
+ trace_bcache_btree_node_split(b, b->keys.nr.live_u64s);
n2 = __btree_split_node(iter, n1, reserve);
+ six_unlock_write(&n2->lock);
six_unlock_write(&n1->lock);
bch_btree_node_write(n2, &as->cl, NULL);
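
The split-vs-compact decision above now measures the real key count of n1 after the insert, instead of estimating b's size plus the keys to be inserted, and compares it against the named BTREE_SPLIT_THRESHOLD(c) (three quarters of btree_blocks(c), per the header hunk below). A node is only worth splitting if it would still fill more than that after compaction; otherwise rewriting it as a single node suffices. A toy version of the predicate, with BTREE_BLOCKS standing in for btree_blocks(c):

#include <stdbool.h>
#include <stdio.h>

#define BTREE_BLOCKS		64			/* hypothetical max node size, in blocks */
#define BTREE_SPLIT_THRESHOLD	(BTREE_BLOCKS * 3 / 4)	/* as defined in btree_update.h */

static bool should_split(unsigned used_blocks)
{
	return used_blocks > BTREE_SPLIT_THRESHOLD;
}

int main(void)
{
	printf("40 blocks -> %s\n", should_split(40) ? "split" : "compact");
	printf("50 blocks -> %s\n", should_split(50) ? "split" : "compact");
	return 0;
}

The removed comment spelled out why the threshold matters: an insert can overwrite many existing keys, and splitting a node that ends up too small would be bad, since a one-key node can't be given a pivot distinct from the original.
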
@@ -1357,11 +1333,11 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
reserve);
btree_split_insert_keys(iter, n3, &as->parent_keys,
- reserve, true);
+ reserve);
bch_btree_node_write(n3, &as->cl, NULL);
}
} else {
- trace_bcache_btree_node_compact(b, le16_to_cpu(btree_bset_first(n1)->u64s));
+ trace_bcache_btree_node_compact(b, b->keys.nr.live_u64s);
six_unlock_write(&n1->lock);
bch_keylist_add(&as->parent_keys, &n1->key);
@@ -1443,15 +1419,17 @@ void bch_btree_insert_node(struct btree *b,
}
}
-static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
- struct closure *cl)
+static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags)
{
struct cache_set *c = iter->c;
struct btree *b = iter->nodes[0];
struct btree_reserve *reserve;
struct btree_interior_update *as;
+ struct closure cl;
int ret = 0;
+ closure_init_stack(&cl);
+
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!down_read_trylock(&c->gc_lock)) {
bch_btree_iter_unlock(iter);
@@ -1468,9 +1446,14 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
}
reserve = bch_btree_reserve_get(c, b, 0,
- !(flags & BTREE_INSERT_NOFAIL), cl);
+ !(flags & BTREE_INSERT_NOFAIL), &cl);
if (IS_ERR(reserve)) {
ret = PTR_ERR(reserve);
+ if (ret == -EAGAIN) {
+ bch_btree_iter_unlock(iter);
+ closure_sync(&cl);
+ ret = -EINTR;
+ }
goto out;
}
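
With this change the -EAGAIN handling lives next to the reserve allocation that produces it: the iterator's locks are dropped, closure_sync() waits for the reserve to become available, and -EINTR is returned so the caller restarts the whole insert (matching the hunk below that deletes the same logic from the caller). A userspace sketch of that retry contract; the helper names here are hypothetical, with try_get_reserve() standing in for bch_btree_reserve_get():

#include <errno.h>
#include <stdio.h>

static int try_get_reserve(int attempt)
{
	/* fail once, as if the reserve had to wait for free space */
	return attempt == 0 ? -EAGAIN : 0;
}

static int split_leaf(int attempt)
{
	int ret = try_get_reserve(attempt);

	if (ret == -EAGAIN) {
		/* unlock the iterator, wait (closure_sync()), then
		 * ask the caller to retry from the top */
		ret = -EINTR;
	}
	return ret;
}

int main(void)
{
	int attempt = 0, ret;

	while ((ret = split_leaf(attempt++)) == -EINTR)
		;	/* caller restarts the whole insert on -EINTR */

	printf("succeeded after %d attempt(s), ret %d\n", attempt, ret);
	return 0;
}
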
@@ -1574,13 +1557,10 @@ int __bch_btree_insert_at(struct btree_insert *trans, u64 *journal_seq)
struct journal_res res = { 0, 0 };
struct btree_insert_entry *i;
struct btree_iter *split = NULL;
- struct closure cl;
bool cycle_gc_lock = false;
unsigned u64s;
int ret;
- closure_init_stack(&cl);
-
trans_for_each_entry(trans, i) {
EBUG_ON(i->iter->level);
EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
@@ -1689,7 +1669,7 @@ split:
* allocating new btree nodes, and holding a journal reservation
* potentially blocks the allocator:
*/
- ret = bch_btree_split_leaf(split, trans->flags, &cl);
+ ret = bch_btree_split_leaf(split, trans->flags);
if (ret)
goto err;
/*
@@ -1699,12 +1679,6 @@ split:
*/
goto retry_locks;
err:
- if (ret == -EAGAIN) {
- bch_btree_iter_unlock(split);
- closure_sync(&cl);
- ret = -EINTR;
- }
-
if (cycle_gc_lock) {
down_read(&c->gc_lock);
up_read(&c->gc_lock);
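
The surviving comment in the last hunk hints at the ordering rule behind this move: splitting allocates new btree nodes, and a held journal reservation can block the allocator, so the reservation must be gone before bch_btree_split_leaf() runs. A minimal userspace sketch of that drop-before-allocate ordering, under that reading of the (truncated) comment and with entirely hypothetical names:

#include <assert.h>
#include <stdbool.h>

static bool journal_res_held;

static void journal_res_put(void)
{
	journal_res_held = false;
}

static void alloc_btree_node(void)
{
	/* the allocator may wait on journal space: a held
	 * reservation here could deadlock */
	assert(!journal_res_held);
}

static void split_leaf(void)
{
	alloc_btree_node();
}

int main(void)
{
	journal_res_held = true;	/* taken for the insert */

	journal_res_put();		/* drop before splitting */
	split_leaf();			/* safe: allocator won't wait on us */
	return 0;
}
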
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index 09e8ca48fb3a..e13faf887b77 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -11,6 +11,8 @@ struct bkey_format_state;
struct bkey_format;
struct btree;
+#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;