author     Kent Overstreet <kent.overstreet@gmail.com>  2016-11-23 16:10:05 -0900
committer  Kent Overstreet <kent.overstreet@gmail.com>  2016-11-28 22:27:46 -0900
commit     548f0f7b68212e51ab9f137313dfa58eb0bdbc64 (patch)
tree       4bfa6b072350d2112a8a909f2f7459bfd0844227
parent     3e36e8a802538591e65f63e5931bd9a3942b60cc (diff)
bcache: Whiteout optimizations: extents
-rw-r--r--  drivers/md/bcache/bkey.h           8
-rw-r--r--  drivers/md/bcache/bkey_methods.c   2
-rw-r--r--  drivers/md/bcache/btree_io.c     153
-rw-r--r--  drivers/md/bcache/extents.c      219
4 files changed, 322 insertions, 60 deletions
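
This patch extends whiteout compaction to extent btree nodes: deleted extents become zero-value KEY_TYPE_DISCARD whiteouts that are sorted and coalesced separately from live keys and, for extent nodes, written out ahead of the live keys in each bset (flagged via SET_BSET_SEPARATE_WHITEOUTS). The rule the patch leans on throughout is that a deleted key only has to survive as a whiteout if an already-written bset still holds a live copy of it. A toy standalone sketch of just that rule, with made-up names rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy illustration (assumed names, not the kernel structures): a deleted
 * key only needs to be kept as a whiteout if a bset already written to
 * disk still contains a live version of it; otherwise the deletion can
 * simply drop the key.
 */
struct key_state {
	bool in_written_bset;	/* live copy exists in an on-disk bset */
	bool deleted;
};

static bool needs_whiteout(const struct key_state *k)
{
	return k->deleted && k->in_written_bset;
}

int main(void)
{
	struct key_state unwritten = { .in_written_bset = false, .deleted = true };
	struct key_state written   = { .in_written_bset = true,  .deleted = true };

	printf("delete of unwritten key needs whiteout: %d\n", needs_whiteout(&unwritten));
	printf("delete of on-disk key needs whiteout:   %d\n", needs_whiteout(&written));
	return 0;
}

Whiteouts that fail this test are simply filtered out when the node is compacted or rewritten, which is what the !in->needs_whiteout check added to sort_extents() below is for.
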
diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h
index 1466e890dad1..acfdd813203b 100644
--- a/drivers/md/bcache/bkey.h
+++ b/drivers/md/bcache/bkey.h
@@ -398,6 +398,14 @@ void bkey_unpack(struct bkey_i *, const struct bkey_format *,
bool bkey_pack(struct bkey_packed *, const struct bkey_i *,
const struct bkey_format *);
+static inline u64 bkey_field_max(const struct bkey_format *f,
+ enum bch_bkey_fields nr)
+{
+ return f->bits_per_field[nr] < 64
+ ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ : U64_MAX;
+}
+
/* Disassembled bkeys */
static inline struct bkey_s_c bkey_disassemble(const struct bkey_format *f,
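
The bkey_field_max() helper added above gives the largest value a packed field can hold: packed keys store (value - field_offset) in bits_per_field bits, so the maximum is the offset plus an all-ones value of that width, saturating to U64_MAX for 64-bit fields. sort_extent_whiteouts() uses it below to know how far a whiteout may be extended while staying packable. A minimal standalone sketch of the same arithmetic, using a simplified stand-in for struct bkey_format:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct bkey_format: a single field only. */
struct fake_format {
	unsigned	bits_per_field;
	uint64_t	field_offset;
};

static uint64_t field_max(const struct fake_format *f)
{
	return f->bits_per_field < 64
		? f->field_offset + ~(~0ULL << f->bits_per_field)
		: UINT64_MAX;
}

int main(void)
{
	struct fake_format f = { .bits_per_field = 20, .field_offset = 1000 };

	/* 20 bits on top of an offset of 1000: 1000 + (2^20 - 1) = 1049575 */
	printf("max encodable value: %llu\n", (unsigned long long)field_max(&f));
	return 0;
}
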
diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c
index 9755cc60557f..3bcd0e041b03 100644
--- a/drivers/md/bcache/bkey_methods.c
+++ b/drivers/md/bcache/bkey_methods.c
@@ -31,9 +31,9 @@ const char *bkey_invalid(struct cache_set *c, enum bkey_type type,
switch (k.k->type) {
case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
return NULL;
- case KEY_TYPE_DISCARD:
case KEY_TYPE_ERROR:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index 177027879a67..c8b88d16e21c 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -16,15 +16,23 @@
#include <trace/events/bcache.h>
-static void verify_no_dups(struct btree_keys *b,
+static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
struct bkey_packed *end)
{
#ifdef CONFIG_BCACHE_DEBUG
+ const struct bkey_format *f = &b->keys.format;
struct bkey_packed *k;
- for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k))
- BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+ for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
+ struct bkey l = bkey_unpack_key(f, k);
+ struct bkey r = bkey_unpack_key(f, bkey_next(k));
+
+ BUG_ON(btree_node_is_extents(b)
+ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+ }
#endif
}
@@ -177,10 +185,80 @@ static unsigned sort_key_whiteouts(struct bkey_packed *dst,
return (u64 *) out - (u64 *) dst;
}
+static inline int sort_extent_whiteouts_cmp(struct btree_keys *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ struct bkey ul = bkey_unpack_key(&b->format, l);
+ struct bkey ur = bkey_unpack_key(&b->format, r);
+
+ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
struct sort_iter *iter)
{
- struct bkey_packed *out = dst;
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *out = dst;
+ struct bkey_i l, r;
+ bool prev = false, l_packed;
+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
+ u64 new_size;
+
+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+ sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+ EBUG_ON(bkeyp_val_u64s(f, in));
+ EBUG_ON(in->type != KEY_TYPE_DISCARD);
+
+ r.k = bkey_unpack_key(f, in);
+
+ if (prev &&
+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ new_size = l_packed
+ ? min(max_packed_size, max_packed_offset -
+ bkey_start_offset(&l.k))
+ : KEY_SIZE_MAX;
+
+ new_size = min(new_size, r.k.p.offset -
+ bkey_start_offset(&l.k));
+
+ BUG_ON(new_size < l.k.size);
+
+ bch_key_resize(&l.k, new_size);
+
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ bch_cut_front(l.k.p, &r);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
+
+ l = r;
+ prev = true;
+ l_packed = bkey_packed(in);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
return (u64 *) out - (u64 *) dst;
}
@@ -224,9 +302,6 @@ static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
unsigned order, whiteout_u64s = 0, u64s;
bool used_mempool, compacting = false;
- if (btree_node_is_extents(b))
- return false;
-
for_each_bset(&b->keys, t)
whiteout_u64s += should_compact_bset(b, t,
whiteout_u64s != 0, mode);
@@ -324,8 +399,18 @@ static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
: sort_key_whiteouts(unwritten_whiteouts_start(c, b),
&sort_iter);
- BUG_ON(u64s != b->whiteout_u64s);
- verify_no_dups(&b->keys,
+ BUG_ON(u64s > b->whiteout_u64s);
+ BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+ BUG_ON(u_pos != whiteouts && !u64s);
+
+ if (u64s != b->whiteout_u64s) {
+ void *src = unwritten_whiteouts_start(c, b);
+
+ b->whiteout_u64s = u64s;
+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+ }
+
+ verify_no_dups(b,
unwritten_whiteouts_start(c, b),
unwritten_whiteouts_end(c, b));
@@ -455,7 +540,8 @@ static unsigned sort_extents(struct bkey_packed *dst,
if (bkey_deleted(in))
continue;
- if (filter_whiteouts && bkey_whiteout(in))
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
continue;
bkey_copy(out, in);
@@ -804,7 +890,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
unsigned *whiteout_u64s)
{
struct bkey_format *f = &b->keys.format;
- struct bkey_packed *k;
+ struct bkey_packed *k, *prev = NULL;
bool seen_non_whiteout = false;
if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
@@ -816,6 +902,11 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
if (i != &b->data->keys && !i->u64s)
btree_node_error(b, c, ptr, "empty set");
+ if (!BSET_SEPARATE_WHITEOUTS(i)) {
+ seen_non_whiteout = true;
+ whiteout_u64s = 0;
+ }
+
for (k = i->start;
k != bset_bkey_last(i);) {
struct bkey_s_c u;
@@ -861,7 +952,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
bch_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), u);
btree_node_error(b, c, ptr,
- "invalid bkey %s", buf);
+ "invalid bkey %s: %s", buf, invalid);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
@@ -869,13 +960,21 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
continue;
}
- if (BSET_SEPARATE_WHITEOUTS(i) &&
- !seen_non_whiteout &&
- !bkey_whiteout(k)) {
+ /*
+ * with the separate whiteouts thing (used for extents), the
+ * second set of keys actually can have whiteouts too, so we
+ * can't solely go off bkey_whiteout()...
+ */
+
+ if (!seen_non_whiteout &&
+ (!bkey_whiteout(k) ||
+ (prev && bkey_cmp_left_packed(f, prev,
+ bkey_start_pos(u.k)) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
}
+ prev = k;
k = bkey_next(k);
}
@@ -1265,10 +1364,6 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
: sizeof(struct btree_node_entry);
bytes += b->whiteout_u64s * sizeof(u64);
- sort_iter_add(&sort_iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
- b->whiteout_u64s = 0;
for_each_bset(&b->keys, t) {
i = t->data;
@@ -1295,11 +1390,27 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
}
i->journal_seq = cpu_to_le64(seq);
+ i->u64s = 0;
+
+ if (!btree_node_is_extents(b)) {
+ sort_iter_add(&sort_iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+ } else {
+ memcpy_u64s(i->start,
+ unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ i->u64s = cpu_to_le16(b->whiteout_u64s);
+ SET_BSET_SEPARATE_WHITEOUTS(i, true);
+ }
+
+ b->whiteout_u64s = 0;
u64s = btree_node_is_extents(b)
- ? sort_extents(i->start, &sort_iter, false)
+ ? sort_extents(bset_bkey_last(i), &sort_iter, false)
: sort_keys(i->start, &sort_iter, false);
- i->u64s = cpu_to_le16(u64s);
+ le16_add_cpu(&i->u64s, u64s);
clear_needs_whiteout(i);
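
sort_extent_whiteouts() above sorts the extent whiteouts by start position, then folds together whiteouts that overlap or touch: the previous whiteout is extended as far as the node's format allows (bkey_field_max() bounds the size and offset) and the next is dropped if fully covered, or trimmed at the front otherwise. Stripped of the packing limits and bkey plumbing, the pass is an interval merge; the following standalone sketch with invented types (not kernel code) shows that core:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Extent whiteouts as half-open [start, end) ranges with no value. */
struct whiteout {
	uint64_t start;
	uint64_t end;
};

static int cmp_start(const void *l, const void *r)
{
	const struct whiteout *a = l, *b = r;

	return (a->start > b->start) - (a->start < b->start);
}

/* Sort by start position, then coalesce overlapping or touching ranges. */
static size_t coalesce(struct whiteout *w, size_t nr)
{
	size_t out = 0;

	qsort(w, nr, sizeof(*w), cmp_start);

	for (size_t i = 0; i < nr; i++) {
		if (out && w[i].start <= w[out - 1].end) {
			/* overlapping or adjacent: extend the previous range */
			if (w[i].end > w[out - 1].end)
				w[out - 1].end = w[i].end;
		} else {
			w[out++] = w[i];
		}
	}
	return out;
}

int main(void)
{
	struct whiteout w[] = { {10, 20}, {0, 5}, {15, 30}, {30, 40} };
	size_t nr = coalesce(w, 4);

	for (size_t i = 0; i < nr; i++)
		printf("[%llu, %llu)\n",
		       (unsigned long long)w[i].start,
		       (unsigned long long)w[i].end);
	return 0;
}

The kernel version differs in that a whiteout which cannot be extended far enough to swallow its successor (because of the packing limits) keeps both, trimmed so they no longer overlap.
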
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 625fcf1f28b6..4a13d90d7e57 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -1048,6 +1048,10 @@ struct extent_insert_state {
struct btree_insert_entry *insert;
struct bpos committed;
struct bucket_stats_cache_set stats;
+
+ /* for deleting: */
+ struct bkey_i whiteout;
+ bool do_journal;
};
static enum btree_insert_ret
@@ -1118,51 +1122,56 @@ drop_deleted_keys:
static void extent_insert_committed(struct extent_insert_state *s)
{
struct cache_set *c = s->trans->c;
- struct bkey_i *insert = s->insert->k;
struct btree_iter *iter = s->insert->iter;
+ struct bkey_i *insert = !bkey_whiteout(&s->insert->k->k)
+ ? s->insert->k
+ : &s->whiteout;
+ BKEY_PADDED(k) split;
- EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
- EBUG_ON(bkey_cmp(s->committed, iter->pos) < 0);
+ EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
- if (bkey_cmp(s->committed, iter->pos) > 0) {
- BKEY_PADDED(k) split;
+ if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+ return;
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+ if (bkey_whiteout(&s->insert->k->k) && !s->do_journal) {
+ bch_cut_front(s->committed, insert);
+ goto done;
+ }
- bkey_copy(&split.k, insert);
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
- bkey_cmp(s->committed, insert->k.p) &&
- bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
- /* XXX: possibly need to increase our reservation? */
- bch_cut_subtract_back(iter, s->committed,
- bkey_i_to_s(&split.k), &s->stats);
- bch_cut_front(s->committed, insert);
- bch_add_sectors(iter, bkey_i_to_s_c(insert),
- bkey_start_offset(&insert->k),
- insert->k.size, &s->stats);
- } else {
- bch_cut_back(s->committed, &split.k.k);
- bch_cut_front(s->committed, insert);
- }
+ bkey_copy(&split.k, insert);
- if (debug_check_bkeys(c))
- bkey_debugcheck(c, iter->nodes[iter->level],
- bkey_i_to_s_c(&split.k));
+ if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+ bkey_cmp(s->committed, insert->k.p) &&
+ bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+ /* XXX: possibly need to increase our reservation? */
+ bch_cut_subtract_back(iter, s->committed,
+ bkey_i_to_s(&split.k), &s->stats);
+ bch_cut_front(s->committed, insert);
+ bch_add_sectors(iter, bkey_i_to_s_c(insert),
+ bkey_start_offset(&insert->k),
+ insert->k.size, &s->stats);
+ } else {
+ bch_cut_back(s->committed, &split.k.k);
+ bch_cut_front(s->committed, insert);
+ }
- /*
- * XXX: if this is a delete, and we didn't overlap with
- * anything, we don't need to journal this:
- */
- bch_btree_journal_key(s->trans, iter, &split.k);
+ if (debug_check_bkeys(c))
+ bkey_debugcheck(c, iter->nodes[iter->level],
+ bkey_i_to_s_c(&split.k));
- extent_bset_insert(c, iter, &split.k);
+ bch_btree_journal_key(s->trans, iter, &split.k);
- bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+ if (!bkey_whiteout(&split.k.k))
+ extent_bset_insert(c, iter, &split.k);
+done:
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
- s->trans->did_work = true;
- }
+ insert->k.needs_whiteout = false;
+ s->do_journal = false;
+ s->trans->did_work = true;
}
static enum extent_insert_hook_ret
@@ -1251,8 +1260,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
struct cache_set *c = s->trans->c;
unsigned sectors;
- if (k.k->size &&
- overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bkey_extent_is_compressed(c, k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
@@ -1333,9 +1341,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
extent_save(&b->keys, node_iter, _k, k.k);
if (extent_insert_advance_pos(s, k.s_c) ==
- BTREE_HOOK_RESTART_TRANS) {
+ BTREE_HOOK_RESTART_TRANS)
return BTREE_INSERT_NEED_TRAVERSE;
- }
+
extent_insert_committed(s);
/*
* We split and inserted upto at k.k->p - that
@@ -1369,6 +1377,8 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
* what k points to)
*/
bkey_reassemble(&split.k, k.s_c);
+ split.k.k.needs_whiteout |= bset_written(b, t->data);
+
bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
BUG_ON(bkey_deleted(&split.k.k));
@@ -1387,6 +1397,121 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
return BTREE_INSERT_OK;
}
+static enum btree_insert_ret
+bch_delete_fixup_extent(struct extent_insert_state *s)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ const struct bkey_format *f = &b->keys.format;
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ struct bkey_i *insert = s->insert->k;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+ s->whiteout = *insert;
+ s->do_journal = false;
+
+ while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+ (_k = bch_btree_node_iter_peek_all(node_iter, &b->keys))) {
+ struct bset_tree *t = bch_bkey_to_bset(&b->keys, _k);
+ struct bkey_s k = __bkey_disassemble(f, _k, &unpacked);
+ enum bch_extent_overlap overlap;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ break;
+
+ if (bkey_whiteout(k.k)) {
+ s->committed = bpos_min(insert->k.p, k.k->p);
+ goto next;
+ }
+
+ overlap = bch_extent_overlap(&insert->k, k.k);
+
+ ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+
+ switch (extent_insert_advance_pos(s, k.s_c)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ continue;
+ case BTREE_HOOK_RESTART_TRANS:
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+ goto stop;
+ }
+
+ s->do_journal = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+ btree_keys_account_key_drop(&b->keys.nr,
+ t - b->keys.set, _k);
+ bch_subtract_sectors(iter, k.s_c,
+ bkey_start_offset(k.k), k.k->size,
+ &s->stats);
+ _k->type = KEY_TYPE_DISCARD;
+ reserve_whiteout(b, t, _k);
+ } else if (k.k->needs_whiteout ||
+ bset_written(b, t->data)) {
+ struct bkey_i discard = *insert;
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ bch_cut_front(bkey_start_pos(k.k), &discard);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ bch_cut_back(k.k->p, &discard.k);
+ break;
+ default:
+ break;
+ }
+
+ discard.k.needs_whiteout = true;
+
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+
+ extent_bset_insert(c, iter, &discard);
+ } else {
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+ }
+next:
+ bch_cut_front(s->committed, insert);
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+ }
+
+ if (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ ret == BTREE_INSERT_OK &&
+ extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+stop:
+ extent_insert_committed(s);
+
+ bch_cache_set_stats_apply(c, &s->stats, s->trans->disk_res,
+ gc_pos_btree_node(b));
+
+ EBUG_ON(bkey_cmp(iter->pos, s->committed));
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+ bch_cut_front(iter->pos, insert);
+
+ if (insert->k.size && iter->at_end_of_leaf)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+
+ EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+
+ return ret;
+}
+
/**
* bch_extent_insert_fixup - insert a new extent and deal with overlaps
*
@@ -1448,6 +1573,9 @@ bch_insert_fixup_extent(struct btree_insert *trans,
EBUG_ON(iter->level);
EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
+ if (bkey_whiteout(&insert->k->k))
+ return bch_delete_fixup_extent(&s);
+
/*
* As we process overlapping extents, we advance @iter->pos both to
* signal to our caller (btree_insert_key()) how much of @insert->k has
@@ -1497,8 +1625,21 @@ bch_insert_fixup_extent(struct btree_insert *trans,
ret = BTREE_INSERT_NEED_TRAVERSE;
goto stop;
}
+
+ if (k.k->size &&
+ (k.k->needs_whiteout || bset_written(b, t->data)))
+ insert->k->k.needs_whiteout = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+ bkey_whiteout(k.k) &&
+ k.k->needs_whiteout) {
+ unreserve_whiteout(b, t, _k);
+ _k->needs_whiteout = false;
+ }
squash:
- extent_squash(&s, insert->k, t, _k, k, overlap);
+ ret = extent_squash(&s, insert->k, t, _k, k, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
}
if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
@@ -2146,6 +2287,8 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
return BCH_MERGE_NOMERGE;
}
+ l->k.needs_whiteout |= r->k.needs_whiteout;
+
/* Keys with no pointers aren't restricted to one bucket and could
* overflow KEY_SIZE
*/
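
The new bch_delete_fixup_extent() walks every key the deleted range overlaps: a fully covered key is flipped to KEY_TYPE_DISCARD in place (with a whiteout reserved for it), while partially covered keys are squashed and, when an on-disk copy has to be overridden, shadowed by a trimmed discard key. Which branch is taken depends on the four-way extent overlap classification; a small standalone sketch of that classification with simplified stand-in types (not the kernel's bch_extent_overlap()):

#include <stdint.h>
#include <stdio.h>

/* How an inserted/deleted range relates to an existing extent. */
enum overlap {
	OVERLAP_FRONT,	/* covers the start of the existing key */
	OVERLAP_BACK,	/* covers the end of the existing key */
	OVERLAP_ALL,	/* covers the existing key entirely */
	OVERLAP_MIDDLE,	/* strictly inside the existing key */
};

/* Extents as half-open [start, end) ranges; end corresponds to k.p. */
struct range {
	uint64_t start;
	uint64_t end;
};

static enum overlap classify(const struct range *insert, const struct range *k)
{
	int covers_front = insert->start <= k->start;
	int covers_back  = insert->end   >= k->end;

	if (covers_front && covers_back)
		return OVERLAP_ALL;
	if (covers_front)
		return OVERLAP_FRONT;
	if (covers_back)
		return OVERLAP_BACK;
	return OVERLAP_MIDDLE;
}

int main(void)
{
	static const char *names[] = { "FRONT", "BACK", "ALL", "MIDDLE" };
	struct range k   = { 10, 20 };
	struct range del = { 12, 25 };

	/* deleting [12, 25) from [10, 20) overlaps the back of the key */
	printf("overlap: %s\n", names[classify(&del, &k)]);
	return 0;
}
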