diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-11-06 02:59:39 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-11-20 16:48:50 -0900 |
commit | 9476b1a499c1b66a714470efcbb1ba624c7e86bb (patch) | |
tree | 162c3758f399bbc246b0181432d00f7ca4a31cb2 | |
parent | 11ca6f678fdb995882e74a3f12ec96facf3250ff (diff) |
bcache: Foreground btree node merging
-rw-r--r-- | drivers/md/bcache/bset.c | 14 | ||||
-rw-r--r-- | drivers/md/bcache/btree_cache.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_cache.h | 5 | ||||
-rw-r--r-- | drivers/md/bcache/btree_gc.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_io.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_iter.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/btree_types.h | 1 | ||||
-rw-r--r-- | drivers/md/bcache/btree_update.c | 282 | ||||
-rw-r--r-- | drivers/md/bcache/btree_update.h | 11 | ||||
-rw-r--r-- | drivers/md/bcache/debug.c | 8 |
10 files changed, 303 insertions, 26 deletions
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index c4bf087235c6..f2cbfb08922a 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1635,7 +1635,7 @@ static struct btree_nr_keys btree_mergesort(struct btree_keys *dst, { struct bkey_format *in_f = &src->format; struct bkey_format *out_f = &dst->format; - struct bkey_packed *in, *out = dst_set->start; + struct bkey_packed *in, *out = bset_bkey_last(dst_set); struct btree_nr_keys nr; BUG_ON(filter); @@ -1711,7 +1711,7 @@ static struct btree_nr_keys btree_mergesort_extents(struct btree_keys *dst, btree_keys_account_key_add(&nr, 0, prev); prev = bkey_next(prev); } else { - prev = dst_set->start; + prev = bset_bkey_last(dst_set); } bkey_copy(prev, &tmp.k); @@ -1725,7 +1725,7 @@ static struct btree_nr_keys btree_mergesort_extents(struct btree_keys *dst, btree_keys_account_key_add(&nr, 0, prev); out = bkey_next(prev); } else { - out = dst_set->start; + out = bset_bkey_last(dst_set); } dst_set->u64s = cpu_to_le16((u64 *) out - dst_set->_data); @@ -1754,6 +1754,8 @@ struct btree_nr_keys bch_sort_bsets(struct bset *dst, struct btree_keys *b, bch_btree_node_iter_sort(iter, b); } + dst->u64s = 0; + /* * If we're only doing a partial sort (start != 0), then we can't merge * extents because that might produce extents that overlap with 0 size @@ -1802,7 +1804,11 @@ void bch_btree_sort_into(struct btree_keys *dst, bch_time_stats_update(state->time, start_time); - dst->nr = nr; + dst->nr.live_u64s += nr.live_u64s; + dst->nr.bset_u64s[0] += nr.bset_u64s[0]; + dst->nr.packed_keys += nr.packed_keys; + dst->nr.unpacked_keys += nr.unpacked_keys; + dst->nsets = 0; /* No auxiliary search tree yet */ dst->set->size = 0; diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c index a997327eb37d..074eb6867355 100644 --- a/drivers/md/bcache/btree_cache.c +++ b/drivers/md/bcache/btree_cache.c @@ -539,6 +539,8 @@ out: b->written = 0; b->keys.nsets = 0; b->keys.set[0].data = NULL; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; bch_time_stats_update(&c->mca_alloc_time, start_time); diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h index 5beec6e4e942..6f02c3aaa890 100644 --- a/drivers/md/bcache/btree_cache.h +++ b/drivers/md/bcache/btree_cache.h @@ -36,6 +36,11 @@ static inline size_t btree_bytes(struct cache_set *c) return c->sb.btree_node_size << 9; } +static inline size_t btree_max_u64s(struct cache_set *c) +{ + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); +} + static inline size_t btree_pages(struct cache_set *c) { return c->sb.btree_node_size >> (PAGE_SHIFT - 9); diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index 4055264b6bb4..ae59348cf3b4 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -581,6 +581,8 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], struct btree *n = new_nodes[i]; recalc_packed_keys(n); + btree_node_reset_sib_u64s(n); + six_unlock_write(&n->lock); bch_btree_node_write(n, &as->cl, NULL); } diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c index 423fcc5869a8..035ff2a7caff 100644 --- a/drivers/md/bcache/btree_io.c +++ b/drivers/md/bcache/btree_io.c @@ -382,6 +382,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, : bch_key_sort_fix_overlapping, true); + btree_node_reset_sib_u64s(b); + err = "short btree key"; if (b->keys.set[0].size && bkey_cmp_packed(&b->keys.format, &b->key.k, diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c index a0d7a204b03c..0f8049867f19 100644 --- a/drivers/md/bcache/btree_iter.c +++ b/drivers/md/bcache/btree_iter.c @@ -747,8 +747,6 @@ static int btree_iter_traverse_error(struct btree_iter *iter, int ret) struct cache_set *c = iter->c; struct btree_iter *linked, *sorted_iters, **i; retry_all: - for_each_linked_btree_iter(iter, linked) - bch_btree_iter_unlock(linked); bch_btree_iter_unlock(iter); if (ret != -ENOMEM && ret != -EINTR) diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h index d4bfd711ac65..04ea88560af2 100644 --- a/drivers/md/bcache/btree_types.h +++ b/drivers/md/bcache/btree_types.h @@ -46,6 +46,7 @@ struct btree { u16 written; u8 level; u8 btree_id; + u16 sib_u64s[2]; /* * XXX: add a delete sequence number, so when btree_node_relock() fails * because the lock sequence number has changed - i.e. the contents were diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c index 8a6ea6e180c6..3a812b88fbd0 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -26,12 +26,18 @@ static void btree_interior_update_updated_root(struct cache_set *, void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b) { - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; + struct bkey_packed *k; + struct bset_tree *t; + struct bkey uk; - for_each_btree_node_key_unpack(&b->keys, k, &iter, &unpacked) - bch_bkey_format_add_key(s, k.k); + for (t = b->keys.set; t <= b->keys.set + b->keys.nsets; t++) + for (k = t->data->start; + k != bset_bkey_last(t->data); + k = bkey_next(k)) + if (!bkey_packed_is_whiteout(&b->keys, k)) { + uk = bkey_unpack_key(&b->keys.format, k); + bch_bkey_format_add_key(s, &uk); + } } static struct bkey_format bch_btree_calc_format(struct btree *b) @@ -44,28 +50,34 @@ static struct bkey_format bch_btree_calc_format(struct btree *b) return bch_bkey_format_done(&s); } -/** - * btree_node_format_fits - check if we could rewrite node with a new format - * - * This assumes all keys can pack with the new format -- it just checks if - * the re-packed keys would fit inside the node itself. - */ -bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f) +static size_t btree_node_u64s_with_format(struct btree *b, + struct bkey_format *new_f) { struct bkey_format *old_f = &b->keys.format; /* stupid integer promotion rules */ - ssize_t new_u64s = + ssize_t delta = (((int) new_f->key_u64s - old_f->key_u64s) * (int) b->keys.nr.packed_keys) + (((int) new_f->key_u64s - BKEY_U64s) * (int) b->keys.nr.unpacked_keys); - bch_verify_btree_nr_keys(&b->keys); + BUG_ON(delta + b->keys.nr.live_u64s < 0); - BUG_ON(new_u64s + b->keys.nr.live_u64s < 0); + return b->keys.nr.live_u64s + delta; +} + +/** + * btree_node_format_fits - check if we could rewrite node with a new format + * + * This assumes all keys can pack with the new format -- it just checks if + * the re-packed keys would fit inside the node itself. + */ +bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f) +{ + size_t u64s = btree_node_u64s_with_format(b, new_f); - return __set_bytes(b->data, b->keys.nr.live_u64s + new_u64s) < + return __set_bytes(b->data, u64s) < PAGE_SIZE << b->keys.page_order; } @@ -280,6 +292,7 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c, set_btree_node_dirty(b); bch_bset_init_first(&b->keys, &b->data->keys); + memset(&b->keys.nr, 0, sizeof(b->keys.nr)); b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb)); SET_BSET_BTREE_LEVEL(&b->data->keys, level); @@ -306,6 +319,7 @@ struct btree *__btree_node_alloc_replacement(struct cache_set *c, bch_btree_sort_into(&n->keys, &b->keys, b->keys.ops->key_normalize, &c->sort); + btree_node_reset_sib_u64s(n); n->key.k.p = b->key.k.p; trace_bcache_btree_node_alloc_replacement(b, n); @@ -1224,6 +1238,9 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n n2->keys.set->size = 0; n2->keys.set->extra = BSET_AUX_TREE_NONE_VAL; + btree_node_reset_sib_u64s(n1); + btree_node_reset_sib_u64s(n2); + bch_verify_btree_nr_keys(&n1->keys); bch_verify_btree_nr_keys(&n2->keys); @@ -1337,6 +1354,8 @@ static void btree_split(struct btree *b, struct btree_iter *iter, n3 = __btree_root_alloc(c, b->level + 1, iter->btree_id, reserve); + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; btree_split_insert_keys(iter, n3, &as->parent_keys, reserve); @@ -1474,6 +1493,214 @@ out: return ret; } +enum btree_node_sibling { + btree_prev_sib, + btree_next_sib, +}; + +static struct btree *btree_node_get_sibling(struct btree_iter *iter, + struct btree *b, + enum btree_node_sibling sib) +{ + struct btree *parent; + struct btree_node_iter node_iter; + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + struct btree *ret; + unsigned level = b->level; + + parent = iter->nodes[level + 1]; + if (!parent) + return NULL; + + if (!btree_node_relock(iter, level + 1)) { + bch_btree_iter_set_locks_want(iter, level + 2); + return ERR_PTR(-EINTR); + } + + node_iter = iter->node_iters[parent->level]; + + k = bch_btree_node_iter_peek_all(&node_iter, &parent->keys); + BUG_ON(bkey_cmp_left_packed(&parent->keys.format, k, b->key.k.p)); + + do { + k = sib == btree_prev_sib + ? bch_btree_node_iter_prev_all(&node_iter, &parent->keys) + : (bch_btree_node_iter_advance(&node_iter, &parent->keys), + bch_btree_node_iter_peek_all(&node_iter, &parent->keys)); + if (!k) + return NULL; + } while (bkey_deleted(k)); + + bkey_unpack(&tmp.k, &parent->keys.format, k); + + ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); + + if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { + btree_node_unlock(iter, level); + ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); + } + + if (!IS_ERR(ret) && !btree_node_relock(iter, level)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } + + return ret; +} + +static int foreground_maybe_merge(struct btree_iter *iter, + enum btree_node_sibling sib) +{ + struct cache_set *c = iter->c; + struct btree_reserve *reserve; + struct btree_interior_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; + struct closure cl; + size_t sib_u64s; + int ret = 0; + + closure_init_stack(&cl); +retry: + if (!btree_node_relock(iter, iter->level)) + return 0; + + b = iter->nodes[iter->level]; + + parent = iter->nodes[b->level + 1]; + if (!parent) + return 0; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) + return 0; + + /* XXX: can't be holding read locks */ + m = btree_node_get_sibling(iter, b, sib); + if (IS_ERR(m)) { + ret = PTR_ERR(m); + goto out; + } + + /* NULL means no sibling: */ + if (!m) { + b->sib_u64s[sib] = U16_MAX; + return 0; + } + + if (sib == btree_prev_sib) { + prev = m; + next = b; + } else { + prev = b; + next = m; + } + + bch_bkey_format_init(&new_s); + __bch_btree_calc_format(&new_s, b); + __bch_btree_calc_format(&new_s, m); + new_f = bch_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b, &new_f) + + btree_node_u64s_with_format(m, &new_f); + + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { + sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + sib_u64s /= 2; + sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { + six_unlock_intent(&m->lock); + return 0; + } + + /* We're changing btree topology, doesn't mix with gc: */ + if (!down_read_trylock(&c->gc_lock)) { + six_unlock_intent(&m->lock); + bch_btree_iter_unlock(iter); + + down_read(&c->gc_lock); + up_read(&c->gc_lock); + ret = -EINTR; + goto out; + } + + if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) { + ret = -EINTR; + goto out_unlock; + } + + reserve = bch_btree_reserve_get(c, b, 0, false, &cl); + if (IS_ERR(reserve)) { + ret = PTR_ERR(reserve); + goto out_unlock; + } + + as = bch_btree_interior_update_alloc(c); + + bch_btree_interior_update_will_free_node(c, as, b); + bch_btree_interior_update_will_free_node(c, as, m); + + n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve); + n->data->min_key = prev->data->min_key; + n->data->max_key = next->data->max_key; + n->data->format = new_f; + n->keys.format = new_f; + n->key.k.p = next->key.k.p; + + bch_btree_sort_into(&n->keys, &prev->keys, + b->keys.ops->key_normalize, + &c->sort); + bch_btree_sort_into(&n->keys, &next->keys, + b->keys.ops->key_normalize, + &c->sort); + six_unlock_write(&n->lock); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch_keylist_add(&as->parent_keys, &delete); + bch_keylist_add(&as->parent_keys, &n->key); + + bch_btree_node_write(n, &as->cl, NULL); + + bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as); + + btree_open_bucket_put(c, n); + bch_btree_node_free_inmem(iter, b); + bch_btree_node_free_inmem(iter, m); + bch_btree_iter_node_replace(iter, n); + + bch_btree_iter_verify(iter, n); + + bch_btree_reserve_put(c, reserve); +out_unlock: + if (ret != -EINTR && ret != -EAGAIN) + bch_btree_iter_set_locks_want(iter, 1); + six_unlock_intent(&m->lock); + up_read(&c->gc_lock); +out: + if (ret == -EAGAIN || ret == -EINTR) { + bch_btree_iter_unlock(iter); + ret = -EINTR; + } + + closure_sync(&cl); + + if (ret == -EINTR) { + ret = bch_btree_iter_traverse(iter); + if (!ret) + goto retry; + } + + return ret; +} + /** * btree_insert_key - insert a key one key into a leaf node */ @@ -1496,6 +1723,11 @@ btree_insert_key(struct btree_insert *trans, live_u64s_added = (int) b->keys.nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + if (u64s_added > live_u64s_added && bch_maybe_compact_deleted_keys(&b->keys)) bch_btree_iter_reinit_node(iter, b); @@ -1658,9 +1890,23 @@ unlock: if (ret) goto err; + /* + * hack: iterators are inconsistent when they hit end of leaf, until + * traversed again + */ trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch_btree_node_write_lazy(i->iter->nodes[0], i->iter); + if (i->iter->at_end_of_leaf) + goto out; + + trans_for_each_entry(trans, i) + if (!same_leaf_as_prev(trans, i)) { + foreground_maybe_merge(i->iter, btree_prev_sib); + foreground_maybe_merge(i->iter, btree_next_sib); + + if (btree_node_relock(i->iter, 0)) + bch_btree_node_write_lazy(i->iter->nodes[0], + i->iter); + } out: /* make sure we didn't lose an error: */ if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG)) diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h index e13faf887b77..22027678a41f 100644 --- a/drivers/md/bcache/btree_update.h +++ b/drivers/md/bcache/btree_update.h @@ -13,6 +13,17 @@ struct btree; #define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) +#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) +#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + +static inline void btree_node_reset_sib_u64s(struct btree *b) +{ + b->sib_u64s[0] = b->keys.nr.live_u64s; + b->sib_u64s[1] = b->keys.nr.live_u64s; +} + struct btree_reserve { struct disk_reservation disk_res; unsigned nr; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 9dcdf99539e8..0597aed83408 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -10,6 +10,7 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_update.h" #include "buckets.h" #include "debug.h" #include "error.h" @@ -309,6 +310,7 @@ static int print_btree_node(struct dump_iter *i, struct btree *b) "l %u %llu:%llu - %llu:%llu:\n" " format: u64s %u fields %u %u %u %u %u\n" " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %zu)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -328,8 +330,10 @@ static int print_btree_node(struct dump_iter *i, struct btree *b) f->bits_per_field[4], b->keys.nr.live_u64s * sizeof(u64), btree_bytes(i->c) - sizeof(struct btree_node), - (b->keys.nr.live_u64s * sizeof(u64) * 100) / - (btree_bytes(i->c) - sizeof(struct btree_node)), + b->keys.nr.live_u64s * 100 / btree_max_u64s(i->c), + b->sib_u64s[0], + b->sib_u64s[1], + BTREE_FOREGROUND_MERGE_THRESHOLD(i->c), b->keys.nr.packed_keys, b->keys.nr.unpacked_keys, stats.floats, |