author		Kent Overstreet <kent.overstreet@gmail.com>	2016-11-17 11:42:17 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-18 21:41:08 -0900
commit		b637fe1816a0109680a3b62b77997ac23e55d821 (patch)
tree		56b3747c621525c411fd59d2bed1cb98b7131a04
parent		d0844f5e00edab7a90a0ddcf612e5e76c458b9f4 (diff)
bcache: Move unwritten whiteouts to a separate area
-rw-r--r--	drivers/md/bcache/bkey.c		|   8
-rw-r--r--	drivers/md/bcache/bset.c		|   6
-rw-r--r--	drivers/md/bcache/bset.h		|   6
-rw-r--r--	drivers/md/bcache/btree_cache.c		|  16
-rw-r--r--	drivers/md/bcache/btree_io.c		| 548
-rw-r--r--	drivers/md/bcache/btree_io.h		|   7
-rw-r--r--	drivers/md/bcache/btree_types.h		|  11
-rw-r--r--	drivers/md/bcache/btree_update.c	|  92
-rw-r--r--	drivers/md/bcache/btree_update.h	|  84
-rw-r--r--	drivers/md/bcache/io_types.h		|   4
-rw-r--r--	include/uapi/linux/bcache.h		|  23
11 files changed, 622 insertions, 183 deletions
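
[Editorial note, not part of the commit: the core idea of this patch is that whiteouts for deletions that haven't hit disk yet are parked in a separate area at the very end of the node buffer, growing downward, while bsets keep growing upward from the front. A minimal sketch of that layout, using illustrative names - NODE_BUF_BYTES stands in for btree_bytes(c), and the real helpers are unwritten_whiteouts_start()/unwritten_whiteouts_end() in btree_update.h below:]

#include <stdint.h>

#define NODE_BUF_BYTES	(256 << 10)	/* stand-in for btree_bytes(c) */

struct node_sketch {
	void		*data;		/* node buffer; bsets grow up from here */
	uint16_t	whiteout_u64s;	/* u64s of whiteouts parked at the end */
};

static inline void *sketch_data_end(const struct node_sketch *b)
{
	return (char *) b->data + NODE_BUF_BYTES;
}

/* Unwritten whiteouts occupy [data_end - whiteout_u64s, data_end): */
static inline uint64_t *sketch_whiteouts_start(const struct node_sketch *b)
{
	return (uint64_t *) sketch_data_end(b) - b->whiteout_u64s;
}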
diff --git a/drivers/md/bcache/bkey.c b/drivers/md/bcache/bkey.c
index 4f1f5317ce43..cf46ef55e0ed 100644
--- a/drivers/md/bcache/bkey.c
+++ b/drivers/md/bcache/bkey.c
@@ -232,8 +232,12 @@ static bool bch_bkey_transform_key(const struct bkey_format *out_f,
 		if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
 			return false;
 
+	/* Can't happen because the val would be too big to unpack: */
+	EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
 	pack_state_finish(&out_s, out);
 	out->u64s	= out_f->key_u64s + in->u64s - in_f->key_u64s;
+	out->needs_whiteout = in->needs_whiteout;
 	out->type	= in->type;
 
 	return true;
@@ -262,9 +266,11 @@ static struct bkey __bkey_unpack_key(const struct bkey_format *format,
 	EBUG_ON(format->nr_fields != 5);
 	EBUG_ON(in->u64s < format->key_u64s);
 	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
 
 	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
 	out.format	= KEY_FORMAT_CURRENT;
+	out.needs_whiteout = in->needs_whiteout;
 	out.type	= in->type;
 	out.pad[0]	= 0;
 	out.p.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
@@ -301,6 +307,7 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 {
 	struct pack_state state = pack_state_init(format, out);
 
+	EBUG_ON((void *) in == (void *) out);
 	EBUG_ON(format->nr_fields != 5);
 	EBUG_ON(in->format != KEY_FORMAT_CURRENT);
 
@@ -323,6 +330,7 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 	pack_state_finish(&state, out);
 	out->u64s	= format->key_u64s + in->u64s - BKEY_U64s;
 	out->format	= KEY_FORMAT_LOCAL_BTREE;
+	out->needs_whiteout = in->needs_whiteout;
 	out->type	= in->type;
 
 	bch_bkey_pack_verify(out, in, format);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f06b5eeb5d7e..f993a508e84c 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -202,7 +202,8 @@ void bch_btree_node_iter_verify(struct btree_node_iter *iter,
 	if (bch_btree_node_iter_bset_pos(iter, b, t->data) ==
 	    bset_bkey_last(t->data) &&
 	    (k = bkey_prev_all(t, bset_bkey_last(t->data))))
-		BUG_ON(__btree_node_iter_cmp(iter, b, k, first) > 0);
+		BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
+					     k, first) > 0);
 }
 
 void bch_verify_key_order(struct btree_keys *b,
@@ -1557,7 +1558,8 @@ struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
 		k = bkey_prev_all(t, bch_btree_node_iter_bset_pos(iter, b, t->data));
 
 		if (k &&
-		    (!prev || __btree_node_iter_cmp(iter, b, k, prev) > 0)) {
+		    (!prev || __btree_node_iter_cmp(iter->is_extents, b,
+						    k, prev) > 0)) {
 			prev = k;
 			prev_i = t->data;
 		}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 361c8dafd39a..32c715cfea16 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -460,7 +460,7 @@ __btree_node_offset_to_key(struct btree_keys *b, u16 k)
 	return (void *) ((u64 *) b->set->data + k);
 }
 
-static inline int __btree_node_iter_cmp(struct btree_node_iter *iter,
+static inline int __btree_node_iter_cmp(bool is_extents,
 					struct btree_keys *b,
 					struct bkey_packed *l,
 					struct bkey_packed *r)
@@ -473,7 +473,7 @@ static inline int __btree_node_iter_cmp(struct btree_node_iter *iter,
 	 * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
 	 * deleted keys have to sort last.
 	 */
-	return bkey_cmp_packed(&b->format, l, r) ?: iter->is_extents
+	return bkey_cmp_packed(&b->format, l, r) ?: is_extents
 		? (int) bkey_deleted(l) - (int) bkey_deleted(r)
 		: (int) bkey_deleted(r) - (int) bkey_deleted(l);
 }
@@ -483,7 +483,7 @@ static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
 				      struct btree_node_iter_set l,
 				      struct btree_node_iter_set r)
 {
-	return __btree_node_iter_cmp(iter, b,
+	return __btree_node_iter_cmp(iter->is_extents, b,
 			__btree_node_offset_to_key(b, l.k),
 			__btree_node_offset_to_key(b, r.k));
 }
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index 42c2609db4b0..49d87cccaefe 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -166,7 +166,7 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
 	 * after the write, since this node is about to be evicted:
 	 */
 	if (btree_node_dirty(b))
-		__bch_btree_node_write(c, b, NULL, -1);
+		__bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
 
 	/* wait for any in flight btree write */
 	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
@@ -507,12 +507,14 @@ out_unlock:
 	list_del_init(&b->list);
 	mutex_unlock(&c->btree_cache_lock);
 out:
-	b->flags	= 0;
-	b->written	= 0;
-	b->keys.nsets	= 0;
-	b->keys.set[0].data = NULL;
-	b->sib_u64s[0]	= 0;
-	b->sib_u64s[1]	= 0;
+	b->flags		= 0;
+	b->written		= 0;
+	b->keys.nsets		= 0;
+	b->keys.set[0].data	= NULL;
+	b->sib_u64s[0]		= 0;
+	b->sib_u64s[1]		= 0;
+	b->whiteout_u64s	= 0;
+	b->uncompacted_whiteout_u64s = 0;
 
 	bch_btree_keys_init(&b->keys, &c->expensive_debug_checks);
 
 	bch_time_stats_update(&c->mca_alloc_time, start_time);
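
[Editorial note, not part of the commit: the bkey.c hunks above all make the same point - needs_whiteout now lives in the key's header byte, so every transcoding path (pack, unpack, format-to-format transform) must copy it explicitly or the flag is silently dropped. A hypothetical miniature of that invariant, with invented names; exact bitfield layout is implementation-defined:]

#include <stdint.h>

struct hdr_sketch {
	uint8_t	u64s;
	uint8_t	format:7, needs_whiteout:1;
	uint8_t	type;
};

static void transcode_hdr(struct hdr_sketch *out, const struct hdr_sketch *in,
			  uint8_t out_format, uint8_t out_u64s)
{
	out->u64s   = out_u64s;
	out->format = out_format;
	/* The line each hunk adds, in miniature: */
	out->needs_whiteout = in->needs_whiteout;
	out->type   = in->type;
}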
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index a467fd4c4421..4b5c4de6b256 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -16,6 +16,34 @@
 
 #include <trace/events/bcache.h>
 
+static void verify_no_dups(struct btree_keys *b,
+			   struct bkey_packed *start,
+			   struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+	struct bkey_packed *k;
+
+	for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k))
+		BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+#endif
+}
+
+static void clear_needs_whiteout(struct bset *i)
+{
+	struct bkey_packed *k;
+
+	for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+		k->needs_whiteout = false;
+}
+
+static void set_needs_whiteout(struct bset *i)
+{
+	struct bkey_packed *k;
+
+	for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+		k->needs_whiteout = true;
+}
+
 static void btree_bounce_free(struct cache_set *c, unsigned order,
 			      bool used_mempool, void *p)
 {
@@ -127,101 +155,280 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
 	return ret;
 }
 
-bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+static inline int sort_key_whiteouts_cmp(struct btree_keys *b,
+					 struct bkey_packed *l,
+					 struct bkey_packed *r)
 {
-	struct bset_tree *t, *rebuild_from = NULL;
+	return bkey_cmp_packed(&b->format, l, r);
+}
+
+static unsigned sort_key_whiteouts(struct bkey_packed *dst,
+				   struct sort_iter *iter)
+{
+	struct bkey_packed *in, *out = dst;
+
+	sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+	while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+		bkey_copy(out, in);
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
+				      struct sort_iter *iter)
+{
+	struct bkey_packed *out = dst;
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+enum compact_mode {
+	COMPACT_LAZY,
+	COMPACT_WRITTEN,
+	COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+				    bool compacting,
+				    enum compact_mode mode)
+{
+	unsigned live_u64s = b->keys.nr.bset_u64s[t - b->keys.set];
+	unsigned bset_u64s = le16_to_cpu(t->data->u64s);
+
+	if (live_u64s == bset_u64s)
+		return 0;
+
+	if (mode == COMPACT_LAZY) {
+		if (live_u64s * 4 < bset_u64s * 3 ||
+		    (compacting && bset_unwritten(b, t->data)))
+			return bset_u64s - live_u64s;
+	} else {
+		if (bset_written(b, t->data))
+			return bset_u64s - live_u64s;
+	}
+
+	return 0;
+}
+
+static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
+				enum compact_mode mode)
+{
+	const struct bkey_format *f = &b->keys.format;
+	struct bset_tree *t;
+	struct bkey_packed *whiteouts = NULL;
+	struct bkey_packed *u_start, *u_pos;
+	struct sort_iter sort_iter;
+	unsigned order, whiteout_u64s = 0, u64s;
+	bool used_mempool, compacting = false;
+
+	if (btree_node_is_extents(b))
+		return false;
+
+	for_each_bset(&b->keys, t)
+		whiteout_u64s += should_compact_bset(b, t,
+					whiteout_u64s != 0, mode);
+
+	if (!whiteout_u64s)
+		return false;
+
+	sort_iter_init(&sort_iter, &b->keys);
+
+	whiteout_u64s += b->whiteout_u64s;
+	order = get_order(whiteout_u64s * sizeof(u64));
+
+	whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+	u_start = u_pos = whiteouts;
+
+	memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+		    b->whiteout_u64s);
+	u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+	sort_iter_add(&sort_iter, u_start, u_pos);
 
 	for_each_bset(&b->keys, t) {
 		struct bset *i = t->data;
-		struct bkey_packed *k, *n, *out = i->start;
-		unsigned live_u64s = b->keys.nr.bset_u64s[t - b->keys.set];
+		struct bkey_packed *k, *n, *out, *start, *end;
+		struct btree_node_entry *src = NULL, *dst = NULL;
 
-		if (live_u64s * 4 > le16_to_cpu(i->u64s) * 3)
-			continue;
+		if (t != b->keys.set && bset_unwritten(b, i)) {
+			src = container_of(i, struct btree_node_entry, keys);
+			dst = max(write_block(b),
+				  (void *) bset_bkey_last(t[-1].data));
+		}
 
-		if (!bset_written(b, i))
+		if (!should_compact_bset(b, t, compacting, mode)) {
+			if (src != dst) {
+				memmove(dst, src, sizeof(*src) +
+					le16_to_cpu(src->keys.u64s) *
+					sizeof(u64));
+				t->data = &dst->keys;
+			}
 			continue;
+		}
+
+		compacting = true;
+		u_start = u_pos;
+		start = i->start;
+		end = bset_bkey_last(i);
 
-		for (k = i->start; k != bset_bkey_last(i); k = n) {
+		if (src != dst) {
+			src = container_of(i, struct btree_node_entry, keys);
+			dst = max(write_block(b),
+				  (void *) bset_bkey_last(t[-1].data));
+
+			memmove(dst, src, sizeof(*src));
+			i = t->data = &dst->keys;
+		}
+
+		out = i->start;
+
+		for (k = start; k != end; k = n) {
 			n = bkey_next(k);
 
-			if (!bkey_whiteout(k)) {
-				bkey_copy(out, k);
-				out = bkey_next(out);
-			} else if (!bset_written(b, i)) {
-				/*
-				 * We cannot drop whiteouts that haven't been
-				 * written out - we can drop the value, though:
-				 *
-				 * XXX we could drop 0 size extents, if we fix
-				 * assertions elsewhere
-				 */
-				set_bkeyp_val_u64s(&b->keys.format, k, 0);
+			if (bkey_deleted(k) && btree_node_is_extents(b))
+				continue;
+
+			if (bkey_whiteout(k) && !k->needs_whiteout)
+				continue;
+
+			if (bkey_whiteout(k)) {
+				unreserve_whiteout(b, t, k);
+				memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+				set_bkeyp_val_u64s(f, u_pos, 0);
+				u_pos = bkey_next(u_pos);
+			} else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
 				bkey_copy(out, k);
 				out = bkey_next(out);
 			}
 		}
 
-		i->u64s = cpu_to_le16((u64 *) out - i->_data);
-		bch_bset_set_no_aux_tree(&b->keys, t);
-		if (!rebuild_from)
-			rebuild_from = t;
+		sort_iter_add(&sort_iter, u_start, u_pos);
+
+		if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+			i->u64s = cpu_to_le16((u64 *) out - i->_data);
+			bch_bset_set_no_aux_tree(&b->keys, t);
+		}
 	}
 
-	if (!rebuild_from)
-		return false;
+	b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
 
-	for (t = rebuild_from; t < b->keys.set + b->keys.nsets; t++) {
-		/*
-		 * Compacting a set that hasn't been written out yet might mean
-		 * we need to shift the set after it down:
-		 */
-		if (!bset_written(b, t->data) &&
-		    t != bset_tree_last(&b->keys)) {
-			struct btree_node_entry *dst = (void *) b->data +
-				(bset_end_sector(c, b, t->data) << 9);
-			struct btree_node_entry *src = container_of(t[1].data,
-					struct btree_node_entry, keys);
-
-			if (dst != src) {
-				memmove(dst, src,
-					(void *) bset_bkey_last(&src->keys) -
-					(void *) src);
-				t[1].data = &dst->keys;
+	BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+	       (void *) bset_bkey_last(btree_bset_last(b)));
+
+	u64s = btree_node_is_extents(b)
+		? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+					&sort_iter)
+		: sort_key_whiteouts(unwritten_whiteouts_start(c, b),
+				     &sort_iter);
+
+	BUG_ON(u64s != b->whiteout_u64s);
+	verify_no_dups(&b->keys,
+		       unwritten_whiteouts_start(c, b),
+		       unwritten_whiteouts_end(c, b));
+
+	btree_bounce_free(c, order, used_mempool, whiteouts);
+
+	if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+		bch_btree_build_aux_trees(b);
+
+	bch_btree_keys_u64s_remaining(c, b);
+	bch_verify_btree_nr_keys(&b->keys);
+
+	return true;
+}
+
+bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+{
+	return __compact_whiteouts(c, b, COMPACT_LAZY);
+}
+
+static bool bch_drop_whiteouts(struct btree *b)
+{
+	struct bset_tree *t;
+	bool ret = false;
+
+	for_each_bset(&b->keys, t) {
+		struct bset *i = t->data;
+		struct bkey_packed *k, *n, *out, *start, *end;
+
+		if (!should_compact_bset(b, t, true, true))
+			continue;
+
+		start = i->start;
+		end = bset_bkey_last(i);
+
+		if (bset_unwritten(b, i) &&
+		    t != b->keys.set) {
+			struct bset *dst =
+				max_t(struct bset *, write_block(b),
+				      (void *) bset_bkey_last(t[-1].data));
+
+			memmove(dst, i, sizeof(struct bset));
+			i = t->data = dst;
+		}
+
+		out = i->start;
+
+		for (k = start; k != end; k = n) {
+			n = bkey_next(k);
+
+			if (!bkey_whiteout(k)) {
+				bkey_copy(out, k);
+				out = bkey_next(out);
 			}
 		}
+
+		i->u64s = cpu_to_le16((u64 *) out - i->_data);
+		bch_bset_set_no_aux_tree(&b->keys, t);
+		ret = true;
 	}
 
-	bch_btree_build_aux_trees(b);
 	bch_verify_btree_nr_keys(&b->keys);
-	return true;
+
+	return ret;
 }
 
-static inline int sort_keys_cmp(struct btree_keys *b,
-				struct bkey_packed *l,
-				struct bkey_packed *r)
+static int sort_keys_cmp(struct btree_keys *b,
+			 struct bkey_packed *l,
+			 struct bkey_packed *r)
 {
 	return bkey_cmp_packed(&b->format, l, r) ?:
-		(int) bkey_whiteout(r) - (int) bkey_whiteout(l);
+		(int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+		(int) l->needs_whiteout - (int) r->needs_whiteout;
 }
 
 static unsigned sort_keys(struct bkey_packed *dst,
 			  struct sort_iter *iter,
 			  bool filter_whiteouts)
 {
+	const struct bkey_format *f = &iter->b->format;
 	struct bkey_packed *in, *next, *out = dst;
 
 	sort_iter_sort(iter, sort_keys_cmp);
 
 	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
-		if ((next = sort_iter_peek(iter)) &&
-		    !bkey_cmp_packed(&iter->b->format, in, next))
+		if (bkey_whiteout(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
 			continue;
 
-		if (filter_whiteouts && bkey_whiteout(in))
+		if (bkey_whiteout(in) &&
+		    (next = sort_iter_peek(iter)) &&
+		    !bkey_cmp_packed(f, in, next)) {
+			BUG_ON(in->needs_whiteout &&
+			       next->needs_whiteout);
+			next->needs_whiteout |= in->needs_whiteout;
 			continue;
+		}
 
-		bkey_copy(out, in);
+		if (bkey_whiteout(in)) {
+			memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+			set_bkeyp_val_u64s(f, out, 0);
+		} else {
+			bkey_copy(out, in);
+		}
 		out = bkey_next(out);
 	}
 
@@ -262,14 +469,14 @@ static void btree_node_sort(struct cache_set *c,
 			    struct btree *b,
 			    struct btree_iter *iter,
 			    unsigned start_idx,
 			    unsigned end_idx,
-			    bool is_write_locked)
+			    bool filter_whiteouts)
 {
 	struct btree_node *out;
 	struct sort_iter sort_iter;
 	struct bset_tree *t;
-	bool used_mempool = false, filter_whiteouts;
+	bool used_mempool = false;
 	u64 start_time;
-	unsigned i, u64s, order, shift = end_idx - start_idx - 1;
+	unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
 	bool sorting_entire_node = start_idx == 0 &&
 		end_idx == b->keys.nsets;
@@ -291,7 +498,8 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
 
 	start_time = local_clock();
 
-	filter_whiteouts = bset_written(b, b->keys.set[start_idx].data);
+	if (btree_node_is_extents(b))
+		filter_whiteouts = bset_written(b, b->keys.set[start_idx].data);
 
 	u64s = btree_node_is_extents(b)
 		? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
@@ -305,9 +513,6 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
 	if (sorting_entire_node)
 		bch_time_stats_update(&c->btree_sort_time, start_time);
 
-	if (!is_write_locked)
-		__btree_node_lock_write(b, iter);
-
 	/* Make sure we preserve bset journal_seq: */
 	for (t = b->keys.set + start_idx + 1;
 	     t < b->keys.set + end_idx;
@@ -353,9 +558,6 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
 
 	bch_bset_set_no_aux_tree(&b->keys, &b->keys.set[start_idx]);
 
-	if (!is_write_locked)
-		__btree_node_unlock_write(b, iter);
-
 	btree_bounce_free(c, order, used_mempool, out);
 
 	bch_verify_btree_nr_keys(&b->keys);
@@ -513,7 +715,7 @@ static bool btree_node_compact(struct cache_set *c, struct btree *b,
 	for (unwritten_idx = 0;
 	     unwritten_idx < b->keys.nsets;
 	     unwritten_idx++)
-		if (!bset_written(b, b->keys.set[unwritten_idx].data))
+		if (bset_unwritten(b, b->keys.set[unwritten_idx].data))
 			break;
 
 	if (b->keys.nsets - unwritten_idx > 1) {
@@ -552,39 +754,23 @@ void bch_btree_build_aux_trees(struct btree *b)
 void bch_btree_init_next(struct cache_set *c, struct btree *b,
 			 struct btree_iter *iter)
 {
-	struct bset *i;
+	struct btree_node_entry *bne;
 	bool did_sort;
-	unsigned offset;
 
-	BUG_ON(iter && iter->nodes[b->level] != b);
+	EBUG_ON(!(b->lock.state.seq & 1));
+	EBUG_ON(iter && iter->nodes[b->level] != b);
 
 	did_sort = btree_node_compact(c, b, iter);
 
-	/* do verify if we sorted down to a single set: */
 	if (did_sort && b->keys.nsets == 1)
 		bch_btree_verify(c, b);
 
-	__btree_node_lock_write(b, iter);
-
-	/*
-	 * If the last bset has been written, or if it's gotten too big - start
-	 * a new bset to insert into:
-	 */
-	i = btree_bset_last(b);
-	offset = next_bset_offset(c, b);
-
-	if (offset < c->sb.btree_node_size &&
-	    (bset_written(b, i) ||
-	     le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
-		struct btree_node_entry *bne = (void *) b->data + (offset << 9);
-
+	bne = want_new_bset(c, b);
+	if (bne)
 		bch_bset_init_next(&b->keys, &bne->keys);
-	}
 
 	bch_btree_build_aux_trees(b);
 
-	__btree_node_unlock_write(b, iter);
-
 	if (iter && did_sort)
 		bch_btree_iter_reinit_node(iter, b);
 }
@@ -839,6 +1025,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
 
 	bch_bset_build_aux_tree(&b->keys, b->keys.set, false);
 
+	set_needs_whiteout(b->keys.set->data);
+
 	btree_node_reset_sib_u64s(b);
 
 	err = "short btree key";
@@ -981,7 +1169,10 @@ static void btree_node_write_endio(struct bio *bio)
 		set_btree_node_write_error(b);
 
 	if (wbio->bounce)
-		bch_bio_free_pages_pool(c, bio);
+		btree_bounce_free(c,
+			wbio->order,
+			wbio->used_mempool,
+			page_address(bio->bi_io_vec[0].bv_page));
 
 	if (wbio->put_bio)
 		bio_put(bio);
@@ -1000,17 +1191,23 @@ static void btree_node_write_endio(struct bio *bio)
 
 void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 			    struct closure *parent,
+			    enum six_lock_type lock_type_held,
 			    int idx_to_write)
 {
 	struct bio *bio;
 	struct bch_write_bio *wbio;
 	struct bset_tree *t;
-	struct bset *last_set_to_write = NULL;
+	struct bset *i;
+	struct btree_node *bn = NULL;
+	struct btree_node_entry *bne = NULL;
 	BKEY_PADDED(key) k;
 	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
 	struct cache *ca;
-	size_t sectors_to_write;
+	struct sort_iter sort_iter;
+	unsigned sectors_to_write, order, bytes, u64s;
+	u64 seq = 0;
+	bool used_mempool;
 	void *data;
 
 	/*
@@ -1025,6 +1222,8 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 
 	btree_node_io_lock(b);
 
+	set_btree_node_just_written(b);
+
 	BUG_ON(!list_empty(&b->write_blocked));
 #if 0
 	/*
@@ -1050,55 +1249,81 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 
 	change_bit(BTREE_NODE_write_idx, &b->flags);
 
-	for_each_bset(&b->keys, t)
-		if (!bset_written(b, t->data))
-			break;
-
-	if (t != bset_tree_last(&b->keys)) {
-		/*
-		 * See if we can get a write lock and compact these bsets
-		 * together:
-		 */
+	if (lock_type_held == SIX_LOCK_intent) {
+		six_lock_write(&b->lock);
+		__compact_whiteouts(c, b, COMPACT_WRITTEN);
+		six_unlock_write(&b->lock);
+	} else {
+		__compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
 	}
 
+	BUG_ON(b->uncompacted_whiteout_u64s);
+
+	sort_iter_init(&sort_iter, &b->keys);
+
+	bytes = !b->written
+		? sizeof(struct btree_node)
+		: sizeof(struct btree_node_entry);
+
+	bytes += b->whiteout_u64s * sizeof(u64);
+	sort_iter_add(&sort_iter,
+		      unwritten_whiteouts_start(c, b),
+		      unwritten_whiteouts_end(c, b));
+	b->whiteout_u64s = 0;
+
 	for_each_bset(&b->keys, t) {
-		struct bset *i = t->data;
+		i = t->data;
 
 		if (bset_written(b, i))
 			continue;
 
-		BUG_ON(i->seq != b->data->keys.seq);
-		BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
-		BUG_ON(i != btree_bset_last(b) &&
-		       bset_end_sector(c, b, i) << 9 !=
-		       bset_byte_offset(b, container_of(t[1].data,
-				struct btree_node_entry, keys)));
+		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+		sort_iter_add(&sort_iter, i->start, bset_bkey_last(i));
+		seq = max(seq, le64_to_cpu(i->journal_seq));
+	}
 
-		/*
-		 * We may have merely initialized a bset but not written
-		 * anything to it yet:
-		 */
-		if (i != btree_bset_first(b) && !i->u64s)
-			continue;
+	order = get_order(bytes);
+	data = btree_bounce_alloc(c, order, &used_mempool);
 
-		last_set_to_write = i;
+	if (!b->written) {
+		bn = data;
+		*bn = *b->data;
+		i = &bn->keys;
+	} else {
+		bne = data;
+		bne->keys = b->data->keys;
+		i = &bne->keys;
+	}
 
-		i->version = cpu_to_le16(BCACHE_BSET_VERSION);
-		SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+	i->journal_seq = cpu_to_le64(seq);
 
-		if (t == b->keys.set) {
-			b->data->csum = cpu_to_le64(btree_csum_set(b, b->data));
-		} else {
-			struct btree_node_entry *bne = container_of(i,
-					struct btree_node_entry, keys);
+	u64s = btree_node_is_extents(b)
+		? sort_extents(i->start, &sort_iter, false)
+		: sort_keys(i->start, &sort_iter, false);
+	i->u64s = cpu_to_le16(u64s);
 
-			bne->csum = cpu_to_le64(btree_csum_set(b, bne));
-		}
+	clear_needs_whiteout(i);
+
+	if (b->written && !i->u64s) {
+		/* Nothing to write: */
+		btree_bounce_free(c, order, used_mempool, data);
+		btree_node_write_done(c, b);
+		return;
 	}
 
-	BUG_ON(!last_set_to_write);
-	sectors_to_write = bset_end_sector(c, b, last_set_to_write) -
-		b->written;
-	data = write_block(b);
+	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+	BUG_ON(i->seq != b->data->keys.seq);
+
+	i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+	SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+
+	if (bn)
+		bn->csum = cpu_to_le64(btree_csum_set(b, bn));
+	else
+		bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+
+	sectors_to_write = round_up((void *) bset_bkey_last(i) - data,
+				    block_bytes(c)) >> 9;
 
 	BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
@@ -1119,16 +1344,20 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 			set_btree_node_write_error(b);
 
 		b->written += sectors_to_write;
+		btree_bounce_free(c, order, used_mempool, data);
 		btree_node_write_done(c, b);
 		return;
 	}
 
-	bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->bio_write);
+	bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
 
 	wbio			= to_wbio(bio);
 	wbio->cl		= parent;
 	wbio->bounce		= true;
 	wbio->put_bio		= true;
+	wbio->order		= order;
+	wbio->used_mempool	= used_mempool;
+	bio->bi_iter.bi_size	= sectors_to_write << 9;
 	bio->bi_end_io		= btree_node_write_endio;
 	bio->bi_private		= b;
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
@@ -1136,8 +1365,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 	if (parent)
 		closure_get(parent);
 
-	bch_bio_alloc_pages_pool(c, bio, sectors_to_write << 9);
-	memcpy_to_bio(bio, bio->bi_iter, data);
+	bch_bio_map(bio, data);
 
 	/*
 	 * If we're appending to a leaf node, we don't technically need FUA -
@@ -1170,6 +1398,64 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 	bch_submit_wbio_replicas(wbio, c, &k.key, true);
 }
 
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
+{
+	bool invalidated_iter = false;
+	struct btree_node_entry *bne;
+	struct bset_tree *t;
+
+	if (!btree_node_just_written(b))
+		return false;
+
+	BUG_ON(b->whiteout_u64s);
+	BUG_ON(b->uncompacted_whiteout_u64s);
+
+	clear_btree_node_just_written(b);
+
+	/*
+	 * Note: immediately after write, bset_unwritten()/bset_written() don't
+	 * work - the amount of data we had to write after compaction might
+	 * have been smaller than the offset of the last bset.
+	 *
+	 * However, we know that all bsets have been written here, as long as
+	 * we're still holding the write lock:
+	 */
+
+	/*
+	 * XXX: decide if we really want to unconditionally sort down to a
+	 * single bset:
+	 */
+	if (b->keys.nsets > 1) {
+		btree_node_sort(c, b, NULL, 0, b->keys.nsets, true);
+		invalidated_iter = true;
+	} else {
+		invalidated_iter = bch_drop_whiteouts(b);
+	}
+
+	for_each_bset(&b->keys, t)
+		set_needs_whiteout(t->data);
+
+	/*
+	 * If later we don't unconditionally sort down to a single bset, we
+	 * have to ensure this is still true:
+	 */
+	BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+
+	bne = want_new_bset(c, b);
+	if (bne)
+		bch_bset_init_next(&b->keys, &bne->keys);
+
+	bch_btree_build_aux_trees(b);
+
+	return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
 void bch_btree_node_write(struct cache_set *c, struct btree *b,
 			  struct closure *parent,
 			  enum six_lock_type lock_type_held,
@@ -1177,15 +1463,19 @@ void bch_btree_node_write(struct cache_set *c, struct btree *b,
 {
 	BUG_ON(lock_type_held == SIX_LOCK_write);
 
-	__bch_btree_node_write(c, b, parent, -1);
-
 	if (lock_type_held == SIX_LOCK_intent ||
 	    six_trylock_convert(&b->lock, SIX_LOCK_read, SIX_LOCK_intent)) {
-		bch_btree_init_next(c, b, NULL);
+		__bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+
+		six_lock_write(&b->lock);
+		bch_btree_post_write_cleanup(c, b);
+		six_unlock_write(&b->lock);
 
 		if (lock_type_held == SIX_LOCK_read)
 			six_lock_downgrade(&b->lock);
+	} else {
+		__bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
 	}
 }
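
[Editorial note, not part of the commit: sort_keys() above applies a dedupe rule during the merge - when several entries for the same position meet, at most one of them may still owe an on-disk whiteout, and that obligation is ORed into the surviving entry. A minimal self-contained sketch of the same rule on a toy int-keyed array (names invented, input assumed sorted so duplicates are adjacent):]

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct wkey {
	int	pos;
	bool	whiteout;
	bool	needs_whiteout;
};

static size_t dedupe_whiteouts(struct wkey *v, size_t n)
{
	size_t in, out = 0;

	for (in = 0; in < n; in++) {
		if (v[in].whiteout && in + 1 < n &&
		    v[in + 1].pos == v[in].pos) {
			/* At most one duplicate may still owe a whiteout: */
			assert(!(v[in].needs_whiteout &&
				 v[in + 1].needs_whiteout));
			v[in + 1].needs_whiteout |= v[in].needs_whiteout;
			continue;	/* drop v[in], keep its obligation */
		}
		v[out++] = v[in];
	}
	return out;
}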
diff --git a/drivers/md/bcache/btree_io.h b/drivers/md/bcache/btree_io.h
index 55c021acd9d9..9b66e707d4b0 100644
--- a/drivers/md/bcache/btree_io.h
+++ b/drivers/md/bcache/btree_io.h
@@ -37,7 +37,9 @@ void bch_btree_complete_write(struct cache_set *, struct btree *,
 			      struct btree_write *);
 
 void __bch_btree_node_write(struct cache_set *, struct btree *,
-			    struct closure *, int);
+			    struct closure *, enum six_lock_type, int);
+bool bch_btree_post_write_cleanup(struct cache_set *, struct btree *);
+
 void bch_btree_node_write(struct cache_set *, struct btree *,
 			  struct closure *, enum six_lock_type, int);
 
@@ -45,7 +47,4 @@ void bch_btree_flush(struct cache_set *);
 void bch_btree_node_flush_journal_entries(struct cache_set *, struct btree *,
 					  struct closure *);
 
-/* buffer up 4k before writing out a bset */
-#define BTREE_WRITE_SET_BUFFER		(4 << 10)
-
 #endif /* _BCACHE_BTREE_IO_H */
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index 5b22b84188df..0d711ec91c20 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -46,6 +46,12 @@ struct btree {
 	u8			level;
 	u8			btree_id;
 	u16			sib_u64s[2];
+	u16			whiteout_u64s;
+	u16			uncompacted_whiteout_u64s;
+
+	struct btree_keys	keys;
+	struct btree_node	*data;
+
 	/*
 	 * XXX: add a delete sequence number, so when btree_node_relock() fails
 	 * because the lock sequence number has changed - i.e. the contents were
@@ -64,9 +70,6 @@ struct btree {
 	 */
 	struct list_head	write_blocked;
 
-	struct btree_keys	keys;
-	struct btree_node	*data;
-
 	struct open_bucket	*ob;
 
 	/* lru list */
@@ -92,6 +95,7 @@ enum btree_flags {
 	BTREE_NODE_write_idx,
 	BTREE_NODE_accessed,
 	BTREE_NODE_write_in_flight,
+	BTREE_NODE_just_written,
 };
 
 BTREE_FLAG(read_error);
@@ -100,6 +104,7 @@ BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
 BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
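
[Editorial note, not part of the commit: BTREE_FLAG(just_written) above follows the existing flag-accessor pattern - one macro invocation per enum btree_flags bit generates the btree_node_just_written()/set_.../clear_... helpers used elsewhere in the patch. A non-atomic sketch of the shape such a macro expands to (the kernel version uses atomic bitops; all names here are invented):]

#include <stdbool.h>

struct flagged {
	unsigned long flags;
};

#define FLAG_SKETCH(flag, bit)						\
static inline bool flagged_##flag(const struct flagged *b)		\
{ return b->flags & (1UL << (bit)); }					\
static inline void set_flagged_##flag(struct flagged *b)		\
{ b->flags |= 1UL << (bit); }						\
static inline void clear_flagged_##flag(struct flagged *b)		\
{ b->flags &= ~(1UL << (bit)); }

FLAG_SKETCH(just_written, 0)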
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index a4e65a896f21..f3579b8a678c 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -655,7 +655,7 @@ static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
-void bch_btree_bset_insert_key(struct btree_iter *iter,
+bool bch_btree_bset_insert_key(struct btree_iter *iter,
 			       struct btree *b,
 			       struct btree_node_iter *node_iter,
 			       struct bkey_i *insert)
@@ -665,27 +665,69 @@ void bch_btree_bset_insert_key(struct btree_iter *iter,
 	struct bset_tree *t;
 	unsigned clobber_u64s;
 
+	EBUG_ON(btree_node_just_written(b));
+	EBUG_ON(bset_written(b, btree_bset_last(b)));
 	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-	EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
 	EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
 		bkey_cmp(insert->k.p, b->data->max_key) > 0);
+	BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
 
 	k = bch_btree_node_iter_peek_all(node_iter, &b->keys);
 	if (k && !bkey_cmp_packed(f, k, &insert->k)) {
+		BUG_ON(bkey_whiteout(k));
+
 		t = bch_bkey_to_bset(&b->keys, k);
 
-		if (!bkey_whiteout(k))
-			btree_keys_account_key_drop(&b->keys.nr,
-						    t - b->keys.set, k);
+		if (bset_unwritten(b, t->data) &&
+		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
+			BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
+
+			k->type = insert->k.type;
+			memcpy_u64s(bkeyp_val(f, k), &insert->v,
+				    bkey_val_u64s(&insert->k));
+			return true;
+		}
+
+		insert->k.needs_whiteout = k->needs_whiteout;
+
+		btree_keys_account_key_drop(&b->keys.nr, t - b->keys.set, k);
 
 		if (t == bset_tree_last(&b->keys)) {
 			clobber_u64s = k->u64s;
+
+			/*
+			 * If we're deleting, and the key we're deleting doesn't
+			 * need a whiteout (it wasn't overwriting a key that had
+			 * been written to disk) - just delete it:
+			 */
+			if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+				bch_bset_delete(&b->keys, k, clobber_u64s);
+				bch_btree_node_iter_fix(iter, b, node_iter, t,
+							k, clobber_u64s, 0);
+				return true;
+			}
+
 			goto overwrite;
 		}
 
 		k->type = KEY_TYPE_DELETED;
 		bch_btree_node_iter_fix(iter, b, node_iter, t, k,
 					k->u64s, k->u64s);
+
+		if (bkey_whiteout(&insert->k)) {
+			reserve_whiteout(b, t, k);
+			return true;
+		} else {
+			k->needs_whiteout = false;
+		}
+	} else {
+		/*
+		 * Deleting, but the key to delete wasn't found - nothing to do:
+		 */
+		if (bkey_whiteout(&insert->k))
+			return false;
+
+		insert->k.needs_whiteout = false;
 	}
 
 	t = bset_tree_last(&b->keys);
@@ -693,9 +735,10 @@ void bch_btree_bset_insert_key(struct btree_iter *iter,
 	clobber_u64s = 0;
 overwrite:
 	bch_bset_insert(&b->keys, node_iter, k, insert, clobber_u64s);
-	if (k->u64s != clobber_u64s || bkey_deleted(&insert->k))
+	if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
 		bch_btree_node_iter_fix(iter, b, node_iter, t, k,
 					clobber_u64s, k->u64s);
+	return true;
 }
 
 static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
@@ -720,7 +763,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
 	 * shouldn't:
 	 */
 	if (!b->level)
-		__bch_btree_node_write(c, b, NULL, i);
+		bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
 
 	six_unlock_read(&b->lock);
 }
@@ -755,9 +798,13 @@ void bch_btree_journal_key(struct btree_insert *trans,
 	if (trans->journal_res.ref) {
 		u64 seq = bch_journal_res_seq(j, &trans->journal_res);
+		bool needs_whiteout = insert->k.needs_whiteout;
 
+		/* ick */
+		insert->k.needs_whiteout = false;
 		bch_journal_add_keys(j, &trans->journal_res,
 				     b->btree_id, insert);
+		insert->k.needs_whiteout = needs_whiteout;
 
 		if (trans->journal_seq)
 			*trans->journal_seq = seq;
@@ -776,11 +823,11 @@ bch_insert_fixup_key(struct btree_insert *trans,
 
 	BUG_ON(iter->level);
 
-	bch_btree_journal_key(trans, iter, insert->k);
-	bch_btree_bset_insert_key(iter,
-				  iter->nodes[0],
-				  &iter->node_iters[0],
-				  insert->k);
+	if (bch_btree_bset_insert_key(iter,
+				      iter->nodes[0],
+				      &iter->node_iters[0],
+				      insert->k))
+		bch_btree_journal_key(trans, iter, insert->k);
 
 	trans->did_work = true;
 	return BTREE_INSERT_OK;
@@ -800,22 +847,19 @@ static void verify_keys_sorted(struct keylist *l)
 static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
 {
 	struct cache_set *c = iter->c;
-	struct bset *i;
-relock:
+
 	btree_node_lock_write(b, iter);
-	i = btree_bset_last(b);
+
+	if (btree_node_just_written(b) &&
+	    bch_btree_post_write_cleanup(c, b))
+		bch_btree_iter_reinit_node(iter, b);
 
 	/*
 	 * If the last bset has been written, or if it's gotten too big - start
 	 * a new bset to insert into:
 	 */
-	if (next_bset_offset(c, b) < c->sb.btree_node_size &&
-	    (bset_written(b, i) ||
-	     le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
-		btree_node_unlock_write(b, iter);
+	if (want_new_bset(c, b))
 		bch_btree_init_next(c, b, iter);
-		goto relock;
-	}
 }
 
 /* Asynchronous interior node update machinery */
@@ -1877,6 +1921,12 @@ retry:
 		if (!same_leaf_as_prev(trans, i))
 			u64s = 0;
 
+		/*
+		 * bch_btree_node_insert_fits() must be called under write lock:
+		 * with only an intent lock, another thread can still call
+		 * bch_btree_node_write(), converting an unwritten bset to a
+		 * written one
+		 */
 		if (!i->done) {
 			u64s += i->k->k.u64s;
 			if (!bch_btree_node_insert_fits(c,
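
[Editorial note, not part of the commit: bch_insert_fixup_key() above flips the old order - the bset insert now reports whether it changed anything, and the key is journalled only if it did, so deleting a key that was never present no longer generates a journal entry. A runnable toy version of that control flow on an unsorted int array (all names invented):]

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int toy_bset[16];
static size_t toy_nr;

/* Returns true if the "bset" changed - the analogue of the new bool return: */
static bool toy_insert(int key, bool is_delete)
{
	size_t i;

	for (i = 0; i < toy_nr; i++)
		if (toy_bset[i] == key) {
			if (is_delete)
				toy_bset[i] = toy_bset[--toy_nr];
			return true;
		}

	if (is_delete)
		return false;	/* nothing to delete: skip the journal */

	toy_bset[toy_nr++] = key;
	return true;
}

int main(void)
{
	if (toy_insert(42, false))
		printf("journalled insert of 42\n");
	if (!toy_insert(7, true))
		printf("delete of missing key 7 never reached the journal\n");
	return 0;
}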
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index 4846c4790b4f..65b0421b6b9c 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -166,11 +166,28 @@ int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);
 
 /* Inserting into a given leaf node (last stage of insert): */
 
-void bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
+bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
 			       struct btree_node_iter *, struct bkey_i *);
 void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
 			   struct bkey_i *);
 
+static inline void *btree_data_end(struct cache_set *c, struct btree *b)
+{
+	return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct cache_set *c,
+							    struct btree *b)
+{
+	return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct cache_set *c,
+							  struct btree *b)
+{
+	return btree_data_end(c, b);
+}
+
 static inline void *write_block(struct btree *b)
 {
 	return (void *) b->data + (b->written << 9);
@@ -193,21 +210,13 @@ static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
 			block_bytes(c)) >> 9;
 }
 
-static inline unsigned next_bset_offset(struct cache_set *c, struct btree *b)
-{
-	unsigned offset = max_t(unsigned, b->written,
-				bset_end_sector(c, b, btree_bset_last(b)));
-
-	EBUG_ON(offset > c->sb.btree_node_size);
-	return offset;
-}
-
 static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
 						   struct btree *b)
 {
 	struct bset *i = btree_bset_last(b);
-	unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64);
+	unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+		b->whiteout_u64s +
+		b->uncompacted_whiteout_u64s;
 	unsigned total = c->sb.btree_node_size << 6;
 
 	EBUG_ON(used > total);
@@ -218,6 +227,36 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
 	return total - used;
 }
 
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+	/*
+	 * Could buffer up larger amounts of keys for btrees with larger keys,
+	 * pending benchmarking:
+	 */
+	return 4 << 10;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
+						     struct btree *b)
+{
+	struct bset *i = btree_bset_last(b);
+	unsigned offset = max_t(unsigned, b->written << 9,
+				bset_byte_offset(b, bset_bkey_last(i)));
+	ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+		(offset + sizeof(struct btree_node_entry) +
+		 b->whiteout_u64s * sizeof(u64) +
+		 b->uncompacted_whiteout_u64s * sizeof(u64));
+
+	EBUG_ON(offset > btree_bytes(c));
+
+	if ((unlikely(bset_written(b, i)) && n > 0) ||
+	    (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
+		      btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+		return (void *) b->data + offset;
+
+	return NULL;
+}
+
 /*
  * write lock must be held on @b (else the dirty bset that we were going to
  * insert into could be written out from under us)
@@ -235,6 +274,27 @@ static inline bool bch_btree_node_insert_fits(struct cache_set *c,
 	return u64s <= bch_btree_keys_u64s_remaining(c, b);
 }
 
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+				      struct bkey_packed *k)
+{
+	if (bset_written(b, t->data)) {
+		EBUG_ON(b->uncompacted_whiteout_u64s <
+			bkeyp_key_u64s(&b->keys.format, k));
+		b->uncompacted_whiteout_u64s -=
+			bkeyp_key_u64s(&b->keys.format, k);
+	}
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+				    struct bkey_packed *k)
+{
+	if (bset_written(b, t->data)) {
+		BUG_ON(!k->needs_whiteout);
+		b->uncompacted_whiteout_u64s +=
+			bkeyp_key_u64s(&b->keys.format, k);
+	}
+}
+
 void bch_btree_insert_node(struct btree *, struct btree_iter *,
 			   struct keylist *, struct btree_reserve *,
 			   struct btree_interior_update *as);
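
[Editorial note, not part of the commit: the space accounting in bch_btree_keys_u64s_remaining() above now charges three things against the node: keys already in bsets, whiteouts parked at the end of the buffer, and whiteouts still owed for deletions in written bsets (the "uncompacted" reservation). A minimal restatement of that arithmetic, in u64 words, with invented names:]

#include <stddef.h>

static inline size_t u64s_remaining_sketch(size_t node_u64s,
					   size_t keys_u64s,
					   size_t whiteout_u64s,
					   size_t uncompacted_whiteout_u64s)
{
	size_t used = keys_u64s + whiteout_u64s + uncompacted_whiteout_u64s;

	/* Free space is whatever the three reservations leave behind: */
	return used <= node_u64s ? node_u64s - used : 0;
}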
diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h
index 2760d0f0edd8..1cea24165f0e 100644
--- a/drivers/md/bcache/io_types.h
+++ b/drivers/md/bcache/io_types.h
@@ -74,6 +74,10 @@ struct bch_write_bio {
 			bounce:1,
 			put_bio:1;
 
+	/* Only for btree writes: */
+	unsigned		used_mempool:1;
+	u8			order;
+
 	struct bio		bio;
 };
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 5f2558b7a096..2f1cc61be66e 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -109,7 +109,13 @@ struct bkey {
 	__u8		u64s;
 
 	/* Format of key (0 for format local to btree node) */
-	__u8		format;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#endif
 
 	/* Type of the value */
 	__u8		type;
@@ -136,7 +142,19 @@ struct bkey_packed {
 	__u8		u64s;
 
 	/* Format of key (0 for format local to btree node) */
-	__u8		format;
+
+	/*
+	 * XXX: next incompat on disk format change, switch format and
+	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
+	 * bits of the bitfield
+	 */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#endif
 
 	/* Type of the value */
 	__u8		type;
@@ -256,6 +274,7 @@ struct bkey_i_##name {						\
 #define KEY_TYPE_DISCARD	1
 #define KEY_TYPE_ERROR		2
 #define KEY_TYPE_COOKIE		3
+#define KEY_TYPE_PERSISTENT_DISCARD 4
 #define KEY_TYPE_GENERIC_NR	128
 
 struct bch_cookie {
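
[Editorial note, not part of the commit: the uapi change above steals the top bit of the old 8-bit format byte for needs_whiteout. On a little-endian build, format occupies the low 7 bits, so any format value below 128 with needs_whiteout clear is byte-identical to the old layout, which is why this isn't an on-disk format break; the #if exists because C bitfield order follows the target's conventions. A small demonstration (bitfield layout is strictly implementation-defined, but this is what GCC/Clang do on little-endian targets):]

#include <stdint.h>
#include <stdio.h>

struct hdr {
	uint8_t format:7, needs_whiteout:1;
};

int main(void)
{
	union { struct hdr h; uint8_t byte; } u = { .h = { .format = 3 } };

	/* Prints 3 on little-endian targets: same byte the old code wrote. */
	printf("raw header byte: %u\n", u.byte);
	return 0;
}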