summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2016-11-06 02:59:39 -0900
committerKent Overstreet <kent.overstreet@gmail.com>2016-11-20 16:48:50 -0900
commit9476b1a499c1b66a714470efcbb1ba624c7e86bb (patch)
tree162c3758f399bbc246b0181432d00f7ca4a31cb2
parent11ca6f678fdb995882e74a3f12ec96facf3250ff (diff)
bcache: Foreground btree node merging
-rw-r--r--drivers/md/bcache/bset.c14
-rw-r--r--drivers/md/bcache/btree_cache.c2
-rw-r--r--drivers/md/bcache/btree_cache.h5
-rw-r--r--drivers/md/bcache/btree_gc.c2
-rw-r--r--drivers/md/bcache/btree_io.c2
-rw-r--r--drivers/md/bcache/btree_iter.c2
-rw-r--r--drivers/md/bcache/btree_types.h1
-rw-r--r--drivers/md/bcache/btree_update.c282
-rw-r--r--drivers/md/bcache/btree_update.h11
-rw-r--r--drivers/md/bcache/debug.c8
10 files changed, 303 insertions, 26 deletions
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index c4bf087235c6..f2cbfb08922a 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1635,7 +1635,7 @@ static struct btree_nr_keys btree_mergesort(struct btree_keys *dst,
{
struct bkey_format *in_f = &src->format;
struct bkey_format *out_f = &dst->format;
- struct bkey_packed *in, *out = dst_set->start;
+ struct bkey_packed *in, *out = bset_bkey_last(dst_set);
struct btree_nr_keys nr;
BUG_ON(filter);
@@ -1711,7 +1711,7 @@ static struct btree_nr_keys btree_mergesort_extents(struct btree_keys *dst,
btree_keys_account_key_add(&nr, 0, prev);
prev = bkey_next(prev);
} else {
- prev = dst_set->start;
+ prev = bset_bkey_last(dst_set);
}
bkey_copy(prev, &tmp.k);
@@ -1725,7 +1725,7 @@ static struct btree_nr_keys btree_mergesort_extents(struct btree_keys *dst,
btree_keys_account_key_add(&nr, 0, prev);
out = bkey_next(prev);
} else {
- out = dst_set->start;
+ out = bset_bkey_last(dst_set);
}
dst_set->u64s = cpu_to_le16((u64 *) out - dst_set->_data);
@@ -1754,6 +1754,8 @@ struct btree_nr_keys bch_sort_bsets(struct bset *dst, struct btree_keys *b,
bch_btree_node_iter_sort(iter, b);
}
+ dst->u64s = 0;
+
/*
* If we're only doing a partial sort (start != 0), then we can't merge
* extents because that might produce extents that overlap with 0 size
@@ -1802,7 +1804,11 @@ void bch_btree_sort_into(struct btree_keys *dst,
bch_time_stats_update(state->time, start_time);
- dst->nr = nr;
+ dst->nr.live_u64s += nr.live_u64s;
+ dst->nr.bset_u64s[0] += nr.bset_u64s[0];
+ dst->nr.packed_keys += nr.packed_keys;
+ dst->nr.unpacked_keys += nr.unpacked_keys;
+
dst->nsets = 0;
/* No auxiliary search tree yet */
dst->set->size = 0;
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index a997327eb37d..074eb6867355 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -539,6 +539,8 @@ out:
b->written = 0;
b->keys.nsets = 0;
b->keys.set[0].data = NULL;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
bch_time_stats_update(&c->mca_alloc_time, start_time);
diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h
index 5beec6e4e942..6f02c3aaa890 100644
--- a/drivers/md/bcache/btree_cache.h
+++ b/drivers/md/bcache/btree_cache.h
@@ -36,6 +36,11 @@ static inline size_t btree_bytes(struct cache_set *c)
return c->sb.btree_node_size << 9;
}
+static inline size_t btree_max_u64s(struct cache_set *c)
+{
+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
static inline size_t btree_pages(struct cache_set *c)
{
return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c
index 4055264b6bb4..ae59348cf3b4 100644
--- a/drivers/md/bcache/btree_gc.c
+++ b/drivers/md/bcache/btree_gc.c
@@ -581,6 +581,8 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
struct btree *n = new_nodes[i];
recalc_packed_keys(n);
+ btree_node_reset_sib_u64s(n);
+
six_unlock_write(&n->lock);
bch_btree_node_write(n, &as->cl, NULL);
}
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index 423fcc5869a8..035ff2a7caff 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -382,6 +382,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
: bch_key_sort_fix_overlapping,
true);
+ btree_node_reset_sib_u64s(b);
+
err = "short btree key";
if (b->keys.set[0].size &&
bkey_cmp_packed(&b->keys.format, &b->key.k,
diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c
index a0d7a204b03c..0f8049867f19 100644
--- a/drivers/md/bcache/btree_iter.c
+++ b/drivers/md/bcache/btree_iter.c
@@ -747,8 +747,6 @@ static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
struct cache_set *c = iter->c;
struct btree_iter *linked, *sorted_iters, **i;
retry_all:
- for_each_linked_btree_iter(iter, linked)
- bch_btree_iter_unlock(linked);
bch_btree_iter_unlock(iter);
if (ret != -ENOMEM && ret != -EINTR)
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index d4bfd711ac65..04ea88560af2 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -46,6 +46,7 @@ struct btree {
u16 written;
u8 level;
u8 btree_id;
+ u16 sib_u64s[2];
/*
* XXX: add a delete sequence number, so when btree_node_relock() fails
* because the lock sequence number has changed - i.e. the contents were
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 8a6ea6e180c6..3a812b88fbd0 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -26,12 +26,18 @@ static void btree_interior_update_updated_root(struct cache_set *,
void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ struct bkey uk;
- for_each_btree_node_key_unpack(&b->keys, k, &iter, &unpacked)
- bch_bkey_format_add_key(s, k.k);
+ for (t = b->keys.set; t <= b->keys.set + b->keys.nsets; t++)
+ for (k = t->data->start;
+ k != bset_bkey_last(t->data);
+ k = bkey_next(k))
+ if (!bkey_packed_is_whiteout(&b->keys, k)) {
+ uk = bkey_unpack_key(&b->keys.format, k);
+ bch_bkey_format_add_key(s, &uk);
+ }
}
static struct bkey_format bch_btree_calc_format(struct btree *b)
@@ -44,28 +50,34 @@ static struct bkey_format bch_btree_calc_format(struct btree *b)
return bch_bkey_format_done(&s);
}
-/**
- * btree_node_format_fits - check if we could rewrite node with a new format
- *
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
- */
-bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f)
+static size_t btree_node_u64s_with_format(struct btree *b,
+ struct bkey_format *new_f)
{
struct bkey_format *old_f = &b->keys.format;
/* stupid integer promotion rules */
- ssize_t new_u64s =
+ ssize_t delta =
(((int) new_f->key_u64s - old_f->key_u64s) *
(int) b->keys.nr.packed_keys) +
(((int) new_f->key_u64s - BKEY_U64s) *
(int) b->keys.nr.unpacked_keys);
- bch_verify_btree_nr_keys(&b->keys);
+ BUG_ON(delta + b->keys.nr.live_u64s < 0);
- BUG_ON(new_u64s + b->keys.nr.live_u64s < 0);
+ return b->keys.nr.live_u64s + delta;
+}
+
+/**
+ * btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * This assumes all keys can pack with the new format -- it just checks if
+ * the re-packed keys would fit inside the node itself.
+ */
+bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f)
+{
+ size_t u64s = btree_node_u64s_with_format(b, new_f);
- return __set_bytes(b->data, b->keys.nr.live_u64s + new_u64s) <
+ return __set_bytes(b->data, u64s) <
PAGE_SIZE << b->keys.page_order;
}
@@ -280,6 +292,7 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c,
set_btree_node_dirty(b);
bch_bset_init_first(&b->keys, &b->data->keys);
+ memset(&b->keys.nr, 0, sizeof(b->keys.nr));
b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb));
SET_BSET_BTREE_LEVEL(&b->data->keys, level);
@@ -306,6 +319,7 @@ struct btree *__btree_node_alloc_replacement(struct cache_set *c,
bch_btree_sort_into(&n->keys, &b->keys,
b->keys.ops->key_normalize,
&c->sort);
+ btree_node_reset_sib_u64s(n);
n->key.k.p = b->key.k.p;
trace_bcache_btree_node_alloc_replacement(b, n);
@@ -1224,6 +1238,9 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
n2->keys.set->size = 0;
n2->keys.set->extra = BSET_AUX_TREE_NONE_VAL;
+ btree_node_reset_sib_u64s(n1);
+ btree_node_reset_sib_u64s(n2);
+
bch_verify_btree_nr_keys(&n1->keys);
bch_verify_btree_nr_keys(&n2->keys);
@@ -1337,6 +1354,8 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
n3 = __btree_root_alloc(c, b->level + 1,
iter->btree_id,
reserve);
+ n3->sib_u64s[0] = U16_MAX;
+ n3->sib_u64s[1] = U16_MAX;
btree_split_insert_keys(iter, n3, &as->parent_keys,
reserve);
@@ -1474,6 +1493,214 @@ out:
return ret;
}
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
+static struct btree *btree_node_get_sibling(struct btree_iter *iter,
+ struct btree *b,
+ enum btree_node_sibling sib)
+{
+ struct btree *parent;
+ struct btree_node_iter node_iter;
+ struct bkey_packed *k;
+ BKEY_PADDED(k) tmp;
+ struct btree *ret;
+ unsigned level = b->level;
+
+ parent = iter->nodes[level + 1];
+ if (!parent)
+ return NULL;
+
+ if (!btree_node_relock(iter, level + 1)) {
+ bch_btree_iter_set_locks_want(iter, level + 2);
+ return ERR_PTR(-EINTR);
+ }
+
+ node_iter = iter->node_iters[parent->level];
+
+ k = bch_btree_node_iter_peek_all(&node_iter, &parent->keys);
+ BUG_ON(bkey_cmp_left_packed(&parent->keys.format, k, b->key.k.p));
+
+ do {
+ k = sib == btree_prev_sib
+ ? bch_btree_node_iter_prev_all(&node_iter, &parent->keys)
+ : (bch_btree_node_iter_advance(&node_iter, &parent->keys),
+ bch_btree_node_iter_peek_all(&node_iter, &parent->keys));
+ if (!k)
+ return NULL;
+ } while (bkey_deleted(k));
+
+ bkey_unpack(&tmp.k, &parent->keys.format, k);
+
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+
+ if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
+ btree_node_unlock(iter, level);
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+ }
+
+ if (!IS_ERR(ret) && !btree_node_relock(iter, level)) {
+ six_unlock_intent(&ret->lock);
+ ret = ERR_PTR(-EINTR);
+ }
+
+ return ret;
+}
+
+static int foreground_maybe_merge(struct btree_iter *iter,
+ enum btree_node_sibling sib)
+{
+ struct cache_set *c = iter->c;
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ struct bkey_format_state new_s;
+ struct bkey_format new_f;
+ struct bkey_i delete;
+ struct btree *b, *m, *n, *prev, *next, *parent;
+ struct closure cl;
+ size_t sib_u64s;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+retry:
+ if (!btree_node_relock(iter, iter->level))
+ return 0;
+
+ b = iter->nodes[iter->level];
+
+ parent = iter->nodes[b->level + 1];
+ if (!parent)
+ return 0;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+ return 0;
+
+ /* XXX: can't be holding read locks */
+ m = btree_node_get_sibling(iter, b, sib);
+ if (IS_ERR(m)) {
+ ret = PTR_ERR(m);
+ goto out;
+ }
+
+ /* NULL means no sibling: */
+ if (!m) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
+ }
+
+ if (sib == btree_prev_sib) {
+ prev = m;
+ next = b;
+ } else {
+ prev = b;
+ next = m;
+ }
+
+ bch_bkey_format_init(&new_s);
+ __bch_btree_calc_format(&new_s, b);
+ __bch_btree_calc_format(&new_s, m);
+ new_f = bch_bkey_format_done(&new_s);
+
+ sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+ btree_node_u64s_with_format(m, &new_f);
+
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ sib_u64s /= 2;
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ }
+
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ b->sib_u64s[sib] = sib_u64s;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
+ six_unlock_intent(&m->lock);
+ return 0;
+ }
+
+ /* We're changing btree topology, doesn't mix with gc: */
+ if (!down_read_trylock(&c->gc_lock)) {
+ six_unlock_intent(&m->lock);
+ bch_btree_iter_unlock(iter);
+
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ ret = -EINTR;
+ goto out;
+ }
+
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ reserve = bch_btree_reserve_get(c, b, 0, false, &cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ goto out_unlock;
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+ bch_btree_interior_update_will_free_node(c, as, m);
+
+ n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
+ n->data->min_key = prev->data->min_key;
+ n->data->max_key = next->data->max_key;
+ n->data->format = new_f;
+ n->keys.format = new_f;
+ n->key.k.p = next->key.k.p;
+
+ bch_btree_sort_into(&n->keys, &prev->keys,
+ b->keys.ops->key_normalize,
+ &c->sort);
+ bch_btree_sort_into(&n->keys, &next->keys,
+ b->keys.ops->key_normalize,
+ &c->sort);
+ six_unlock_write(&n->lock);
+
+ bkey_init(&delete.k);
+ delete.k.p = prev->key.k.p;
+ bch_keylist_add(&as->parent_keys, &delete);
+ bch_keylist_add(&as->parent_keys, &n->key);
+
+ bch_btree_node_write(n, &as->cl, NULL);
+
+ bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
+
+ btree_open_bucket_put(c, n);
+ bch_btree_node_free_inmem(iter, b);
+ bch_btree_node_free_inmem(iter, m);
+ bch_btree_iter_node_replace(iter, n);
+
+ bch_btree_iter_verify(iter, n);
+
+ bch_btree_reserve_put(c, reserve);
+out_unlock:
+ if (ret != -EINTR && ret != -EAGAIN)
+ bch_btree_iter_set_locks_want(iter, 1);
+ six_unlock_intent(&m->lock);
+ up_read(&c->gc_lock);
+out:
+ if (ret == -EAGAIN || ret == -EINTR) {
+ bch_btree_iter_unlock(iter);
+ ret = -EINTR;
+ }
+
+ closure_sync(&cl);
+
+ if (ret == -EINTR) {
+ ret = bch_btree_iter_traverse(iter);
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
+}
+
/**
* btree_insert_key - insert a key one key into a leaf node
*/
@@ -1496,6 +1723,11 @@ btree_insert_key(struct btree_insert *trans,
live_u64s_added = (int) b->keys.nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
if (u64s_added > live_u64s_added &&
bch_maybe_compact_deleted_keys(&b->keys))
bch_btree_iter_reinit_node(iter, b);
@@ -1658,9 +1890,23 @@ unlock:
if (ret)
goto err;
+ /*
+ * hack: iterators are inconsistent when they hit end of leaf, until
+ * traversed again
+ */
trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch_btree_node_write_lazy(i->iter->nodes[0], i->iter);
+ if (i->iter->at_end_of_leaf)
+ goto out;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i)) {
+ foreground_maybe_merge(i->iter, btree_prev_sib);
+ foreground_maybe_merge(i->iter, btree_next_sib);
+
+ if (btree_node_relock(i->iter, 0))
+ bch_btree_node_write_lazy(i->iter->nodes[0],
+ i->iter);
+ }
out:
/* make sure we didn't lose an error: */
if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG))
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index e13faf887b77..22027678a41f 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -13,6 +13,17 @@ struct btree;
#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+ b->sib_u64s[0] = b->keys.nr.live_u64s;
+ b->sib_u64s[1] = b->keys.nr.live_u64s;
+}
+
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 9dcdf99539e8..0597aed83408 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -10,6 +10,7 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -309,6 +310,7 @@ static int print_btree_node(struct dump_iter *i, struct btree *b)
"l %u %llu:%llu - %llu:%llu:\n"
" format: u64s %u fields %u %u %u %u %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %zu)\n"
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
@@ -328,8 +330,10 @@ static int print_btree_node(struct dump_iter *i, struct btree *b)
f->bits_per_field[4],
b->keys.nr.live_u64s * sizeof(u64),
btree_bytes(i->c) - sizeof(struct btree_node),
- (b->keys.nr.live_u64s * sizeof(u64) * 100) /
- (btree_bytes(i->c) - sizeof(struct btree_node)),
+ b->keys.nr.live_u64s * 100 / btree_max_u64s(i->c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ BTREE_FOREGROUND_MERGE_THRESHOLD(i->c),
b->keys.nr.packed_keys,
b->keys.nr.unpacked_keys,
stats.floats,