author		Kent Overstreet <kent.overstreet@gmail.com>	2016-11-23 16:10:05 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2016-11-28 22:27:46 -0900
commit		548f0f7b68212e51ab9f137313dfa58eb0bdbc64 (patch)
tree		4bfa6b072350d2112a8a909f2f7459bfd0844227
parent		3e36e8a802538591e65f63e5931bd9a3942b60cc (diff)
bcache: Whiteout optimizations: extents
-rw-r--r--  drivers/md/bcache/bkey.h          |   8
-rw-r--r--  drivers/md/bcache/bkey_methods.c  |   2
-rw-r--r--  drivers/md/bcache/btree_io.c      | 153
-rw-r--r--  drivers/md/bcache/extents.c       | 219
4 files changed, 322 insertions, 60 deletions
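
The core idea of the patch: extent whiteouts (KEY_TYPE_DISCARD keys) mark deleted ranges rather than single positions, so before a node is written they are sorted by start position and overlapping or touching whiteouts are coalesced, with the merged key's size capped at what the node's packed key format can still represent (via the new bkey_field_max()). The following is a minimal standalone sketch of that coalescing shape, not the kernel code: plain structs, a made-up MAX_SIZE, and an in-place array stand in for packed bkeys, the sort_iter machinery, and KEY_SIZE_MAX.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for an extent whiteout: the key's "position" is the
 * end of the range and "size" its length, so it covers [pos - size, pos).
 * MAX_SIZE plays the role of the largest size the packed format can hold. */
struct whiteout {
	uint64_t pos;	/* end of range */
	uint64_t size;	/* length of range */
};

#define MAX_SIZE 8u	/* deliberately small so the cap is visible */

static uint64_t range_start(const struct whiteout *w)
{
	return w->pos - w->size;
}

/*
 * Coalesce an array of whiteouts (already sorted by start) in place:
 * grow the previous whiteout to swallow an overlapping successor as far as
 * the size cap allows, and trim the front of the successor otherwise -
 * the same shape as the sort_extent_whiteouts() loop in the patch.
 */
static unsigned coalesce(struct whiteout *w, unsigned nr)
{
	unsigned out = 0;

	for (unsigned i = 0; i < nr; i++) {
		struct whiteout cur = w[i];

		if (out && range_start(&cur) <= w[out - 1].pos) {
			struct whiteout *prev = &w[out - 1];

			if (cur.pos <= prev->pos)
				continue;	/* fully covered, drop it */

			/* grow prev as far as the size cap allows */
			uint64_t new_size = cur.pos - range_start(prev);
			if (new_size > MAX_SIZE)
				new_size = MAX_SIZE;
			prev->pos  = range_start(prev) + new_size;
			prev->size = new_size;

			if (cur.pos <= prev->pos)
				continue;	/* merge covered everything */

			/* trim the front of cur to start where prev now ends */
			cur.size = cur.pos - prev->pos;
		}

		w[out++] = cur;
	}

	return out;
}

int main(void)
{
	struct whiteout w[] = {
		{ .pos = 4,  .size = 4 },	/* [0, 4)            */
		{ .pos = 6,  .size = 3 },	/* [3, 6)  overlaps   */
		{ .pos = 12, .size = 7 },	/* [5, 12) hits the cap */
	};
	unsigned nr = coalesce(w, 3);

	/* prints [0, 8) and [8, 12): merged up to the cap, remainder trimmed */
	for (unsigned i = 0; i < nr; i++)
		printf("[%llu, %llu)\n",
		       (unsigned long long)range_start(&w[i]),
		       (unsigned long long)w[i].pos);
	return 0;
}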
diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h
index 1466e890dad1..acfdd813203b 100644
--- a/drivers/md/bcache/bkey.h
+++ b/drivers/md/bcache/bkey.h
@@ -398,6 +398,14 @@ void bkey_unpack(struct bkey_i *, const struct bkey_format *,
 bool bkey_pack(struct bkey_packed *, const struct bkey_i *,
	       const struct bkey_format *);
 
+static inline u64 bkey_field_max(const struct bkey_format *f,
+				 enum bch_bkey_fields nr)
+{
+	return f->bits_per_field[nr] < 64
+		? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+		: U64_MAX;
+}
+
 /* Disassembled bkeys */
 
 static inline struct bkey_s_c bkey_disassemble(const struct bkey_format *f,
diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c
index 9755cc60557f..3bcd0e041b03 100644
--- a/drivers/md/bcache/bkey_methods.c
+++ b/drivers/md/bcache/bkey_methods.c
@@ -31,9 +31,9 @@ const char *bkey_invalid(struct cache_set *c, enum bkey_type type,
 
 	switch (k.k->type) {
 	case KEY_TYPE_DELETED:
+	case KEY_TYPE_DISCARD:
 		return NULL;
 
-	case KEY_TYPE_DISCARD:
 	case KEY_TYPE_ERROR:
 		return bkey_val_bytes(k.k) != 0
 			? "value size should be zero"
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index 177027879a67..c8b88d16e21c 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -16,15 +16,23 @@
 
 #include <trace/events/bcache.h>
 
-static void verify_no_dups(struct btree_keys *b,
+static void verify_no_dups(struct btree *b,
 			   struct bkey_packed *start,
 			   struct bkey_packed *end)
 {
 #ifdef CONFIG_BCACHE_DEBUG
+	const struct bkey_format *f = &b->keys.format;
 	struct bkey_packed *k;
 
-	for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k))
-		BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+	for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
+		struct bkey l = bkey_unpack_key(f, k);
+		struct bkey r = bkey_unpack_key(f, bkey_next(k));
+
+		BUG_ON(btree_node_is_extents(b)
+		       ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
+		       : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+		//BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+	}
 #endif
 }
 
@@ -177,10 +185,80 @@ static unsigned sort_key_whiteouts(struct bkey_packed *dst,
 	return (u64 *) out - (u64 *) dst;
 }
 
+static inline int sort_extent_whiteouts_cmp(struct btree_keys *b,
+					    struct bkey_packed *l,
+					    struct bkey_packed *r)
+{
+	struct bkey ul = bkey_unpack_key(&b->format, l);
+	struct bkey ur = bkey_unpack_key(&b->format, r);
+
+	return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
 				      struct sort_iter *iter)
 {
-	struct bkey_packed *out = dst;
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *out = dst;
+	struct bkey_i l, r;
+	bool prev = false, l_packed;
+	u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
+	u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
+	u64 new_size;
+
+	max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+	sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+	while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+		EBUG_ON(bkeyp_val_u64s(f, in));
+		EBUG_ON(in->type != KEY_TYPE_DISCARD);
+
+		r.k = bkey_unpack_key(f, in);
+
+		if (prev &&
+		    bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			new_size = l_packed
+				? min(max_packed_size, max_packed_offset -
+				      bkey_start_offset(&l.k))
+				: KEY_SIZE_MAX;
+
+			new_size = min(new_size, r.k.p.offset -
+				       bkey_start_offset(&l.k));
+
+			BUG_ON(new_size < l.k.size);
+
+			bch_key_resize(&l.k, new_size);
+
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			bch_cut_front(l.k.p, &r);
+		}
+
+		if (prev) {
+			if (!bkey_pack(out, &l, f)) {
+				BUG_ON(l_packed);
+				bkey_copy(out, &l);
+			}
+			out = bkey_next(out);
+		}
+
+		l = r;
+		prev = true;
+		l_packed = bkey_packed(in);
+	}
+
+	if (prev) {
+		if (!bkey_pack(out, &l, f)) {
+			BUG_ON(l_packed);
+			bkey_copy(out, &l);
+		}
+		out = bkey_next(out);
+	}
 
 	return (u64 *) out - (u64 *) dst;
 }
@@ -224,9 +302,6 @@ static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
 	unsigned order, whiteout_u64s = 0, u64s;
 	bool used_mempool, compacting = false;
 
-	if (btree_node_is_extents(b))
-		return false;
-
 	for_each_bset(&b->keys, t)
 		whiteout_u64s += should_compact_bset(b, t,
 					whiteout_u64s != 0, mode);
@@ -324,8 +399,18 @@ static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
 		: sort_key_whiteouts(unwritten_whiteouts_start(c, b),
 				     &sort_iter);
 
-	BUG_ON(u64s != b->whiteout_u64s);
-	verify_no_dups(&b->keys,
+	BUG_ON(u64s > b->whiteout_u64s);
+	BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+	BUG_ON(u_pos != whiteouts && !u64s);
+
+	if (u64s != b->whiteout_u64s) {
+		void *src = unwritten_whiteouts_start(c, b);
+
+		b->whiteout_u64s = u64s;
+		memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+	}
+
+	verify_no_dups(b,
 		       unwritten_whiteouts_start(c, b),
 		       unwritten_whiteouts_end(c, b));
 
@@ -455,7 +540,8 @@ static unsigned sort_extents(struct bkey_packed *dst,
 		if (bkey_deleted(in))
 			continue;
 
-		if (filter_whiteouts && bkey_whiteout(in))
+		if (bkey_whiteout(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
 			continue;
 
 		bkey_copy(out, in);
@@ -804,7 +890,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 				 unsigned *whiteout_u64s)
 {
 	struct bkey_format *f = &b->keys.format;
-	struct bkey_packed *k;
+	struct bkey_packed *k, *prev = NULL;
 	bool seen_non_whiteout = false;
 
 	if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
@@ -816,6 +902,11 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 	if (i != &b->data->keys && !i->u64s)
 		btree_node_error(b, c, ptr, "empty set");
 
+	if (!BSET_SEPARATE_WHITEOUTS(i)) {
+		seen_non_whiteout = true;
+		whiteout_u64s = 0;
+	}
+
 	for (k = i->start; k != bset_bkey_last(i);) {
 		struct bkey_s_c u;
 
@@ -861,7 +952,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 			bch_bkey_val_to_text(c, btree_node_type(b),
 					     buf, sizeof(buf), u);
 			btree_node_error(b, c, ptr,
-					 "invalid bkey %s", buf);
+					 "invalid bkey %s: %s", buf, invalid);
 
 			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
 			memmove_u64s_down(k, bkey_next(k),
@@ -869,13 +960,21 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 			continue;
 		}
 
-		if (BSET_SEPARATE_WHITEOUTS(i) &&
-		    !seen_non_whiteout &&
-		    !bkey_whiteout(k)) {
+		/*
+		 * with the separate whiteouts thing (used for extents), the
+		 * second set of keys actually can have whiteouts too, so we
+		 * can't solely go off bkey_whiteout()...
+		 */
+
+		if (!seen_non_whiteout &&
+		    (!bkey_whiteout(k) ||
+		     (prev && bkey_cmp_left_packed(f, prev,
+					bkey_start_pos(u.k)) > 0))) {
 			*whiteout_u64s = k->_data - i->_data;
 			seen_non_whiteout = true;
 		}
 
+		prev = k;
 		k = bkey_next(k);
 	}
 
@@ -1265,10 +1364,6 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 		: sizeof(struct btree_node_entry);
 
 	bytes += b->whiteout_u64s * sizeof(u64);
-	sort_iter_add(&sort_iter,
-		      unwritten_whiteouts_start(c, b),
-		      unwritten_whiteouts_end(c, b));
-	b->whiteout_u64s = 0;
 
 	for_each_bset(&b->keys, t) {
 		i = t->data;
@@ -1295,11 +1390,27 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 	}
 
 	i->journal_seq = cpu_to_le64(seq);
+	i->u64s = 0;
+
+	if (!btree_node_is_extents(b)) {
+		sort_iter_add(&sort_iter,
+			      unwritten_whiteouts_start(c, b),
+			      unwritten_whiteouts_end(c, b));
+		SET_BSET_SEPARATE_WHITEOUTS(i, false);
+	} else {
+		memcpy_u64s(i->start,
+			    unwritten_whiteouts_start(c, b),
+			    b->whiteout_u64s);
+		i->u64s = cpu_to_le16(b->whiteout_u64s);
+		SET_BSET_SEPARATE_WHITEOUTS(i, true);
+	}
+
+	b->whiteout_u64s = 0;
 
 	u64s = btree_node_is_extents(b)
-		? sort_extents(i->start, &sort_iter, false)
+		? sort_extents(bset_bkey_last(i), &sort_iter, false)
 		: sort_keys(i->start, &sort_iter, false);
-	i->u64s = cpu_to_le16(u64s);
+	le16_add_cpu(&i->u64s, u64s);
 
 	clear_needs_whiteout(i);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 625fcf1f28b6..4a13d90d7e57 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -1048,6 +1048,10 @@ struct extent_insert_state {
 	struct btree_insert_entry *insert;
 	struct bpos committed;
 	struct bucket_stats_cache_set stats;
+
+	/* for deleting: */
+	struct bkey_i whiteout;
+	bool do_journal;
 };
 
 static enum btree_insert_ret
@@ -1118,51 +1122,56 @@ drop_deleted_keys:
 static void extent_insert_committed(struct extent_insert_state *s)
 {
 	struct cache_set *c = s->trans->c;
-	struct bkey_i *insert = s->insert->k;
 	struct btree_iter *iter = s->insert->iter;
+	struct bkey_i *insert = !bkey_whiteout(&s->insert->k->k)
+		? s->insert->k
+		: &s->whiteout;
+	BKEY_PADDED(k) split;
 
-	EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
 	EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
-	EBUG_ON(bkey_cmp(s->committed, iter->pos) < 0);
+	EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
 
-	if (bkey_cmp(s->committed, iter->pos) > 0) {
-		BKEY_PADDED(k) split;
+	if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+		return;
 
-		EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+	if (bkey_whiteout(&s->insert->k->k) && !s->do_journal) {
+		bch_cut_front(s->committed, insert);
+		goto done;
+	}
 
-		bkey_copy(&split.k, insert);
+	EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
 
-		if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
-		    bkey_cmp(s->committed, insert->k.p) &&
-		    bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
-			/* XXX: possibly need to increase our reservation? */
-			bch_cut_subtract_back(iter, s->committed,
-					      bkey_i_to_s(&split.k), &s->stats);
-			bch_cut_front(s->committed, insert);
-			bch_add_sectors(iter, bkey_i_to_s_c(insert),
-					bkey_start_offset(&insert->k),
-					insert->k.size, &s->stats);
-		} else {
-			bch_cut_back(s->committed, &split.k.k);
-			bch_cut_front(s->committed, insert);
-		}
+	bkey_copy(&split.k, insert);
 
-		if (debug_check_bkeys(c))
-			bkey_debugcheck(c, iter->nodes[iter->level],
-					bkey_i_to_s_c(&split.k));
+	if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+	    bkey_cmp(s->committed, insert->k.p) &&
+	    bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+		/* XXX: possibly need to increase our reservation? */
+		bch_cut_subtract_back(iter, s->committed,
+				      bkey_i_to_s(&split.k), &s->stats);
+		bch_cut_front(s->committed, insert);
+		bch_add_sectors(iter, bkey_i_to_s_c(insert),
+				bkey_start_offset(&insert->k),
+				insert->k.size, &s->stats);
+	} else {
+		bch_cut_back(s->committed, &split.k.k);
+		bch_cut_front(s->committed, insert);
+	}
 
-		/*
-		 * XXX: if this is a delete, and we didn't overlap with
-		 * anything, we don't need to journal this:
-		 */
-		bch_btree_journal_key(s->trans, iter, &split.k);
+	if (debug_check_bkeys(c))
+		bkey_debugcheck(c, iter->nodes[iter->level],
+				bkey_i_to_s_c(&split.k));
 
-		extent_bset_insert(c, iter, &split.k);
+	bch_btree_journal_key(s->trans, iter, &split.k);
 
-		bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+	if (!bkey_whiteout(&split.k.k))
+		extent_bset_insert(c, iter, &split.k);
+done:
+	bch_btree_iter_set_pos_same_leaf(iter, s->committed);
 
-		s->trans->did_work = true;
-	}
+	insert->k.needs_whiteout = false;
+	s->do_journal = false;
+	s->trans->did_work = true;
 }
 
 static enum extent_insert_hook_ret
@@ -1251,8 +1260,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
 	struct cache_set *c = s->trans->c;
 	unsigned sectors;
 
-	if (k.k->size &&
-	    overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+	if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
 	    (sectors = bkey_extent_is_compressed(c, k))) {
 		int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
 
@@ -1333,9 +1341,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 			extent_save(&b->keys, node_iter, _k, k.k);
 
 			if (extent_insert_advance_pos(s, k.s_c) ==
-			    BTREE_HOOK_RESTART_TRANS) {
+			    BTREE_HOOK_RESTART_TRANS)
 				return BTREE_INSERT_NEED_TRAVERSE;
-			}
+
 			extent_insert_committed(s);
 			/*
 			 * We split and inserted upto at k.k->p - that
@@ -1369,6 +1377,8 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 		 * what k points to)
 		 */
 		bkey_reassemble(&split.k, k.s_c);
+		split.k.k.needs_whiteout |= bset_written(b, t->data);
+
 		bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
 		BUG_ON(bkey_deleted(&split.k.k));
 
@@ -1387,6 +1397,121 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 	return BTREE_INSERT_OK;
 }
 
+static enum btree_insert_ret
+bch_delete_fixup_extent(struct extent_insert_state *s)
+{
+	struct cache_set *c = s->trans->c;
+	struct btree_iter *iter = s->insert->iter;
+	struct btree *b = iter->nodes[0];
+	struct btree_node_iter *node_iter = &iter->node_iters[0];
+	const struct bkey_format *f = &b->keys.format;
+	struct bkey_packed *_k;
+	struct bkey unpacked;
+	struct bkey_i *insert = s->insert->k;
+	enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+	s->whiteout = *insert;
+	s->do_journal = false;
+
+	while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+	       (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+	       (_k = bch_btree_node_iter_peek_all(node_iter, &b->keys))) {
+		struct bset_tree *t = bch_bkey_to_bset(&b->keys, _k);
+		struct bkey_s k = __bkey_disassemble(f, _k, &unpacked);
+		enum bch_extent_overlap overlap;
+
+		EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+		EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+		if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+			break;
+
+		if (bkey_whiteout(k.k)) {
+			s->committed = bpos_min(insert->k.p, k.k->p);
+			goto next;
+		}
+
+		overlap = bch_extent_overlap(&insert->k, k.k);
+
+		ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+		if (ret != BTREE_INSERT_OK)
+			goto stop;
+
+		switch (extent_insert_advance_pos(s, k.s_c)) {
+		case BTREE_HOOK_DO_INSERT:
+			break;
+		case BTREE_HOOK_NO_INSERT:
+			continue;
+		case BTREE_HOOK_RESTART_TRANS:
+			ret = BTREE_INSERT_NEED_TRAVERSE;
+			goto stop;
+		}
+
+		s->do_journal = true;
+
+		if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+			btree_keys_account_key_drop(&b->keys.nr,
+						    t - b->keys.set, _k);
+			bch_subtract_sectors(iter, k.s_c,
+					     bkey_start_offset(k.k), k.k->size,
+					     &s->stats);
+			_k->type = KEY_TYPE_DISCARD;
+			reserve_whiteout(b, t, _k);
+		} else if (k.k->needs_whiteout ||
+			   bset_written(b, t->data)) {
+			struct bkey_i discard = *insert;
+
+			switch (overlap) {
+			case BCH_EXTENT_OVERLAP_FRONT:
+				bch_cut_front(bkey_start_pos(k.k), &discard);
+				break;
+			case BCH_EXTENT_OVERLAP_BACK:
+				bch_cut_back(k.k->p, &discard.k);
+				break;
+			default:
+				break;
+			}
+
+			discard.k.needs_whiteout = true;
+
+			ret = extent_squash(s, insert, t, _k, k, overlap);
+			BUG_ON(ret != BTREE_INSERT_OK);
+
+			extent_bset_insert(c, iter, &discard);
+		} else {
+			ret = extent_squash(s, insert, t, _k, k, overlap);
+			BUG_ON(ret != BTREE_INSERT_OK);
+		}
+next:
+		bch_cut_front(s->committed, insert);
+		bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+	}
+
+	if (bkey_cmp(s->committed, insert->k.p) < 0 &&
+	    ret == BTREE_INSERT_OK &&
+	    extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+		ret = BTREE_INSERT_NEED_TRAVERSE;
stop:
+	extent_insert_committed(s);
+
+	bch_cache_set_stats_apply(c, &s->stats, s->trans->disk_res,
+				  gc_pos_btree_node(b));
+
+	EBUG_ON(bkey_cmp(iter->pos, s->committed));
+	EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+	bch_cut_front(iter->pos, insert);
+
+	if (insert->k.size && iter->at_end_of_leaf)
+		ret = BTREE_INSERT_NEED_TRAVERSE;
+
+	EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+
+	return ret;
+}
+
 /**
  * bch_extent_insert_fixup - insert a new extent and deal with overlaps
  *
@@ -1448,6 +1573,9 @@ bch_insert_fixup_extent(struct btree_insert *trans,
 	EBUG_ON(iter->level);
 	EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
 
+	if (bkey_whiteout(&insert->k->k))
+		return bch_delete_fixup_extent(&s);
+
 	/*
 	 * As we process overlapping extents, we advance @iter->pos both to
 	 * signal to our caller (btree_insert_key()) how much of @insert->k has
@@ -1497,8 +1625,21 @@ bch_insert_fixup_extent(struct btree_insert *trans,
 				ret = BTREE_INSERT_NEED_TRAVERSE;
 				goto stop;
 			}
+
+			if (k.k->size &&
+			    (k.k->needs_whiteout || bset_written(b, t->data)))
+				insert->k->k.needs_whiteout = true;
+
+			if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+			    bkey_whiteout(k.k) &&
+			    k.k->needs_whiteout) {
+				unreserve_whiteout(b, t, _k);
+				_k->needs_whiteout = false;
+			}
squash:
-		extent_squash(&s, insert->k, t, _k, k, overlap);
+		ret = extent_squash(&s, insert->k, t, _k, k, overlap);
+		if (ret != BTREE_INSERT_OK)
+			goto stop;
 	}
 
 	if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
@@ -2146,6 +2287,8 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
 			return BCH_MERGE_NOMERGE;
 	}
 
+	l->k.needs_whiteout |= r->k.needs_whiteout;
+
 	/* Keys with no pointers aren't restricted to one bucket and could
 	 * overflow KEY_SIZE
 	 */
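
For reference, the new bkey_field_max() helper just computes the largest value a packed bkey field can hold: the field's offset plus the all-ones pattern of bits_per_field bits, with an explicit U64_MAX case for 64-bit-wide fields, where the shift would be undefined. A small userspace sketch of the same arithmetic, using a hypothetical struct rather than the kernel's bkey_format:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the two bkey_format members the helper reads. */
struct format {
	unsigned bits_per_field;
	uint64_t field_offset;
};

/* Largest representable value: offset + (2^bits - 1); a 64-bit shift by 64
 * is undefined in C, hence the explicit UINT64_MAX branch. */
static uint64_t field_max(const struct format *f)
{
	return f->bits_per_field < 64
		? f->field_offset + ~(~0ULL << f->bits_per_field)
		: UINT64_MAX;
}

int main(void)
{
	struct format offset20 = { .bits_per_field = 20, .field_offset = 1 << 16 };
	struct format full     = { .bits_per_field = 64, .field_offset = 0 };

	/* 65536 + (2^20 - 1) = 1114111 */
	printf("%llu\n", (unsigned long long)field_max(&offset20));
	printf("%llu\n", (unsigned long long)field_max(&full));
	return 0;
}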