author     Kent Overstreet <kent.overstreet@gmail.com>   2016-11-06 08:00:44 -0900
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-01-18 21:41:07 -0900
commit     d0844f5e00edab7a90a0ddcf612e5e76c458b9f4 (patch)
tree       a8568d9faa02944baabe45ea0f861927b84a6a9f
parent     b2dfeb671b53f608ab7299aee1bd5958c82ba367 (diff)
bcache: Optimize btree node writes
 drivers/md/bcache/bkey.h         |   9
 drivers/md/bcache/bset.c         |  52
 drivers/md/bcache/bset.h         |  16
 drivers/md/bcache/btree_gc.c     |   2
 drivers/md/bcache/btree_io.c     | 269
 drivers/md/bcache/btree_io.h     |  10
 drivers/md/bcache/btree_update.c |  44
 drivers/md/bcache/btree_update.h |  17
 drivers/md/bcache/extents.c      |   6
 9 files changed, 234 insertions(+), 191 deletions(-)
diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h
index b562ac04c4d9..1466e890dad1 100644
--- a/drivers/md/bcache/bkey.h
+++ b/drivers/md/bcache/bkey.h
@@ -73,6 +73,9 @@ static inline void set_bkey_deleted(struct bkey *k)
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
@@ -354,6 +357,12 @@ static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
return bkeyp_val_u64s(format, k) * sizeof(u64);
}
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+ struct bkey_packed *k, unsigned val_u64s)
+{
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
#define bkeyp_val(_format, _k) \
((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
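
A minimal standalone sketch of what the two new helpers in this hunk express, using toy stand-in types (toy_key, toy_whiteout and toy_drop_val are made up for illustration, not the kernel's struct bkey_packed or bkey_format): a key whose type is DELETED or DISCARD counts as a whiteout, and set_bkeyp_val_u64s() lets a caller drop a packed key's value by resizing the key down to just its key part.

#include <stdio.h>

/* Simplified stand-ins for the on-disk key layout (assumption, not the
 * real structs from bkey.h): */
enum { KEY_TYPE_DELETED = 0, KEY_TYPE_DISCARD = 1, KEY_TYPE_EXTENT = 2 };

struct toy_key {
	unsigned u64s;      /* total size of key + value, in u64s */
	unsigned key_u64s;  /* size of the key part alone, in u64s */
	unsigned type;
};

/* Mirrors the new bkey_whiteout() macro: deleted and discard keys are
 * both treated as whiteouts. */
static int toy_whiteout(const struct toy_key *k)
{
	return k->type == KEY_TYPE_DELETED || k->type == KEY_TYPE_DISCARD;
}

/* Mirrors set_bkeyp_val_u64s(..., 0): resize the key so the value is gone. */
static void toy_drop_val(struct toy_key *k)
{
	k->u64s = k->key_u64s;
}

int main(void)
{
	struct toy_key k = { .u64s = 5, .key_u64s = 2, .type = KEY_TYPE_DISCARD };

	if (toy_whiteout(&k))
		toy_drop_val(&k);	/* whiteout kept, value dropped */

	printf("u64s after dropping value: %u\n", k.u64s);	/* prints 2 */
	return 0;
}
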
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 0b581b9203f8..f06b5eeb5d7e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -145,7 +145,7 @@ void __bch_verify_btree_nr_keys(struct btree_keys *b)
for (k = t->data->start;
k != bset_bkey_last(t->data);
k = bkey_next(k))
- if (!bkey_packed_is_whiteout(b, k))
+ if (!bkey_whiteout(k))
btree_keys_account_key_add(&nr, t - b->set, k);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
@@ -1116,7 +1116,7 @@ void bch_bset_insert(struct btree_keys *b,
if (bkey_pack_key(&packed, &insert->k, f))
src = &packed;
- if (!bkey_is_whiteout(&insert->k))
+ if (!bkey_whiteout(&insert->k))
btree_keys_account_key_add(&b->nr, t - b->set, src);
if (src->u64s != clobber_u64s) {
@@ -1612,53 +1612,7 @@ struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
}
EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack);
-bool bch_maybe_compact_deleted_keys(struct btree_keys *b)
-{
- struct bset_tree *t, *rebuild_from = NULL;
- bool last_set_aux_tree_ro = bset_has_ro_aux_tree(bset_tree_last(b));
-
- for_each_bset(b, t) {
- struct bset *i = t->data;
- struct bkey_packed *k, *n, *out = i->start;
-
- if (b->nr.bset_u64s[t - b->set] * 4 > le16_to_cpu(i->u64s) * 3)
- continue;
-
- /*
- * We cannot drop deleted keys from the last bset, if it hasn't
- * been written yet - we may need that whiteout on disk:
- *
- * XXX unless they're extents, if we fix assertions elsewhere
- */
- if (t == bset_tree_last(b) && !last_set_aux_tree_ro)
- break;
-
- for (k = i->start; k != bset_bkey_last(i); k = n) {
- n = bkey_next(k);
-
- if (!bkey_packed_is_whiteout(b, k)) {
- bkey_copy(out, k);
- out = bkey_next(out);
- }
- }
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- bch_bset_set_no_aux_tree(b, t);
-
- if (!rebuild_from)
- rebuild_from = t;
- }
-
- if (!rebuild_from)
- return false;
-
- for (t = rebuild_from; t < b->set + b->nsets; t++)
- bch_bset_build_aux_tree(b, t,
- t == bset_tree_last(b) &&
- !last_set_aux_tree_ro);
-
- return true;
-}
+/* Mergesort */
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index aba5ae3f99ed..361c8dafd39a 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -564,20 +564,6 @@ struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
-static inline bool bkey_is_whiteout(const struct bkey *k)
-{
- return bkey_deleted(k) ||
- (k->type == KEY_TYPE_DISCARD && !k->version);
-}
-
-static inline bool bkey_packed_is_whiteout(const struct btree_keys *b,
- const struct bkey_packed *k)
-{
- return bkey_deleted(k) ||
- (k->type == KEY_TYPE_DISCARD &&
- !bkey_unpack_key(&b->format, k).version);
-}
-
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
@@ -597,8 +583,6 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, -1)
-bool bch_maybe_compact_deleted_keys(struct btree_keys *);
-
struct bset_stats {
struct {
size_t nr, bytes;
diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c
index b25398826f42..e0b08e137094 100644
--- a/drivers/md/bcache/btree_gc.c
+++ b/drivers/md/bcache/btree_gc.c
@@ -591,7 +591,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
bch_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
}
/*
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index 8345c733edfb..a467fd4c4421 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -127,12 +127,82 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
return ret;
}
+bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+{
+ struct bset_tree *t, *rebuild_from = NULL;
+
+ for_each_bset(&b->keys, t) {
+ struct bset *i = t->data;
+ struct bkey_packed *k, *n, *out = i->start;
+ unsigned live_u64s = b->keys.nr.bset_u64s[t - b->keys.set];
+
+ if (live_u64s * 4 > le16_to_cpu(i->u64s) * 3)
+ continue;
+
+ if (!bset_written(b, i))
+ continue;
+
+ for (k = i->start; k != bset_bkey_last(i); k = n) {
+ n = bkey_next(k);
+
+ if (!bkey_whiteout(k)) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ } else if (!bset_written(b, i)) {
+ /*
+ * We cannot drop whiteouts that haven't been
+ * written out - we can drop the value, though:
+ *
+ * XXX we could drop 0 size extents, if we fix
+ * assertions elsewhere
+ */
+ set_bkeyp_val_u64s(&b->keys.format, k, 0);
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ }
+ }
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ bch_bset_set_no_aux_tree(&b->keys, t);
+ if (!rebuild_from)
+ rebuild_from = t;
+ }
+
+ if (!rebuild_from)
+ return false;
+
+ for (t = rebuild_from; t < b->keys.set + b->keys.nsets; t++) {
+ /*
+ * Compacting a set that hasn't been written out yet might mean
+ * we need to shift the set after it down:
+ */
+ if (!bset_written(b, t->data) &&
+ t != bset_tree_last(&b->keys)) {
+ struct btree_node_entry *dst = (void *) b->data +
+ (bset_end_sector(c, b, t->data) << 9);
+ struct btree_node_entry *src = container_of(t[1].data,
+ struct btree_node_entry, keys);
+
+ if (dst != src) {
+ memmove(dst, src,
+ (void *) bset_bkey_last(&src->keys) -
+ (void *) src);
+ t[1].data = &dst->keys;
+ }
+ }
+ }
+
+ bch_btree_build_aux_trees(b);
+ bch_verify_btree_nr_keys(&b->keys);
+ return true;
+}
+
static inline int sort_keys_cmp(struct btree_keys *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
return bkey_cmp_packed(&b->format, l, r) ?:
- (int) bkey_deleted(r) - (int) bkey_deleted(l);
+ (int) bkey_whiteout(r) - (int) bkey_whiteout(l);
}
static unsigned sort_keys(struct bkey_packed *dst,
@@ -148,7 +218,7 @@ static unsigned sort_keys(struct bkey_packed *dst,
!bkey_cmp_packed(&iter->b->format, in, next))
continue;
- if (filter_whiteouts && bkey_deleted(in))
+ if (filter_whiteouts && bkey_whiteout(in))
continue;
bkey_copy(out, in);
@@ -178,7 +248,7 @@ static unsigned sort_extents(struct bkey_packed *dst,
if (bkey_deleted(in))
continue;
- if (filter_whiteouts && bkey_packed_is_whiteout(iter->b, in))
+ if (filter_whiteouts && bkey_whiteout(in))
continue;
bkey_copy(out, in);
@@ -189,7 +259,9 @@ static unsigned sort_extents(struct bkey_packed *dst,
}
static void btree_node_sort(struct cache_set *c, struct btree *b,
- struct btree_iter *iter, unsigned from,
+ struct btree_iter *iter,
+ unsigned start_idx,
+ unsigned end_idx,
bool is_write_locked)
{
struct btree_node *out;
@@ -197,13 +269,14 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
struct bset_tree *t;
bool used_mempool = false, filter_whiteouts;
u64 start_time;
- unsigned i, u64s, order, shift = b->keys.nsets - from - 1;
- bool sorting_entire_node = from == 0;
+ unsigned i, u64s, order, shift = end_idx - start_idx - 1;
+ bool sorting_entire_node = start_idx == 0 &&
+ end_idx == b->keys.nsets;
sort_iter_init(&sort_iter, &b->keys);
- for (t = b->keys.set + from;
- t < b->keys.set + b->keys.nsets;
+ for (t = b->keys.set + start_idx;
+ t < b->keys.set + end_idx;
t++) {
u64s += le16_to_cpu(t->data->u64s);
sort_iter_add(&sort_iter, t->data->start,
@@ -218,7 +291,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
start_time = local_clock();
- filter_whiteouts = bset_written(b, b->keys.set[from].data);
+ filter_whiteouts = bset_written(b, b->keys.set[start_idx].data);
u64s = btree_node_is_extents(b)
? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
@@ -229,18 +302,18 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
BUG_ON((void *) bset_bkey_last(&out->keys) >
(void *) out + (PAGE_SIZE << order));
- if (!from)
+ if (sorting_entire_node)
bch_time_stats_update(&c->btree_sort_time, start_time);
if (!is_write_locked)
__btree_node_lock_write(b, iter);
/* Make sure we preserve bset journal_seq: */
- for (t = b->keys.set + from + 1;
- t < b->keys.set + b->keys.nsets;
+ for (t = b->keys.set + start_idx + 1;
+ t < b->keys.set + end_idx;
t++)
- b->keys.set[from].data->journal_seq =
- max(b->keys.set[from].data->journal_seq,
+ b->keys.set[start_idx].data->journal_seq =
+ max(b->keys.set[start_idx].data->journal_seq,
t->data->journal_seq);
if (sorting_entire_node) {
@@ -258,19 +331,19 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
swap(out, b->data);
b->keys.set->data = &b->data->keys;
} else {
- b->keys.set[from].data->u64s = out->keys.u64s;
- memcpy_u64s(b->keys.set[from].data->start,
+ b->keys.set[start_idx].data->u64s = out->keys.u64s;
+ memcpy_u64s(b->keys.set[start_idx].data->start,
out->keys.start,
le16_to_cpu(out->keys.u64s));
}
- for (i = from + 1; i < b->keys.nsets; i++)
- b->keys.nr.bset_u64s[from] +=
+ for (i = start_idx + 1; i < end_idx; i++)
+ b->keys.nr.bset_u64s[start_idx] +=
b->keys.nr.bset_u64s[i];
b->keys.nsets -= shift;
- for (i = from + 1; i < b->keys.nsets; i++) {
+ for (i = start_idx + 1; i < b->keys.nsets; i++) {
b->keys.nr.bset_u64s[i] = b->keys.nr.bset_u64s[i + shift];
b->keys.set[i] = b->keys.set[i + shift];
}
@@ -278,7 +351,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
for (i = b->keys.nsets; i < MAX_BSETS; i++)
b->keys.nr.bset_u64s[i] = 0;
- bch_bset_set_no_aux_tree(&b->keys, &b->keys.set[from]);
+ bch_bset_set_no_aux_tree(&b->keys, &b->keys.set[start_idx]);
if (!is_write_locked)
__btree_node_unlock_write(b, iter);
@@ -302,7 +375,7 @@ static struct btree_nr_keys sort_repack(struct bset *dst,
memset(&nr, 0, sizeof(nr));
while ((in = bch_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_packed_is_whiteout(src, in))
+ if (filter_whiteouts && bkey_whiteout(in))
continue;
if (bch_bkey_transform(out_f, out, bkey_packed(in)
@@ -337,7 +410,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
memset(&nr, 0, sizeof(nr));
while ((k = bch_btree_node_iter_next_all(iter, src))) {
- if (filter_whiteouts && bkey_packed_is_whiteout(src, k))
+ if (filter_whiteouts && bkey_whiteout(k))
continue;
/*
@@ -434,35 +507,27 @@ void bch_btree_sort_into(struct cache_set *c,
static bool btree_node_compact(struct cache_set *c, struct btree *b,
struct btree_iter *iter)
{
- unsigned crit = SORT_CRIT;
- unsigned crit_factor = int_sqrt(btree_pages(c));
- int i = 0;
-
- /* Don't sort if nothing to do */
- if (b->keys.nsets == 1)
- return false;
+ unsigned unwritten_idx;
+ bool ret = false;
- /* If not a leaf node, always sort */
- if (b->level)
- goto sort;
-
- for (i = b->keys.nsets - 2; i >= 0; --i) {
- crit *= crit_factor;
+ for (unwritten_idx = 0;
+ unwritten_idx < b->keys.nsets;
+ unwritten_idx++)
+ if (!bset_written(b, b->keys.set[unwritten_idx].data))
+ break;
- if (le16_to_cpu(b->keys.set[i].data->u64s) < crit)
- goto sort;
+ if (b->keys.nsets - unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, unwritten_idx,
+ b->keys.nsets, false);
+ ret = true;
}
- /* Sort if we'd overflow */
- if (b->keys.nsets == MAX_BSETS) {
- i = 0;
- goto sort;
+ if (unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+ ret = true;
}
- return false;
-sort:
- btree_node_sort(c, b, iter, i, false);
- return true;
+ return ret;
}
void bch_btree_build_aux_trees(struct btree *b)
@@ -487,7 +552,9 @@ void bch_btree_build_aux_trees(struct btree *b)
void bch_btree_init_next(struct cache_set *c, struct btree *b,
struct btree_iter *iter)
{
+ struct bset *i;
bool did_sort;
+ unsigned offset;
BUG_ON(iter && iter->nodes[b->level] != b);
@@ -499,8 +566,17 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b,
__btree_node_lock_write(b, iter);
- if (b->written < c->sb.btree_node_size) {
- struct btree_node_entry *bne = write_block(b);
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ i = btree_bset_last(b);
+ offset = next_bset_offset(c, b);
+
+ if (offset < c->sb.btree_node_size &&
+ (bset_written(b, i) ||
+ le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
+ struct btree_node_entry *bne = (void *) b->data + (offset << 9);
bch_bset_init_next(&b->keys, &bne->keys);
}
@@ -612,7 +688,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
if (BSET_SEPARATE_WHITEOUTS(i) &&
!seen_non_whiteout &&
- !bkey_packed_is_whiteout(&b->keys, k)) {
+ !bkey_whiteout(k)) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
}
@@ -928,7 +1004,8 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
{
struct bio *bio;
struct bch_write_bio *wbio;
- struct bset *i;
+ struct bset_tree *t;
+ struct bset *last_set_to_write = NULL;
BKEY_PADDED(key) k;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
@@ -965,38 +1042,64 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
#endif
trace_bcache_btree_write(c, b);
- i = btree_bset_last(b);
BUG_ON(b->written >= c->sb.btree_node_size);
- BUG_ON(b->written && !i->u64s);
- BUG_ON(btree_bset_first(b)->seq != i->seq);
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(bset_written(b, btree_bset_last(b)));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+ BUG_ON(memcmp(&b->data->format, &b->keys.format,
+ sizeof(b->keys.format)));
change_bit(BTREE_NODE_write_idx, &b->flags);
- i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ for_each_bset(&b->keys, t)
+ if (!bset_written(b, t->data))
+ break;
- SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+ if (t != bset_tree_last(&b->keys)) {
+ /*
+ * See if we can get a write lock and compact these bsets
+ * together:
+ */
+ }
- if (!b->written) {
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+ for_each_bset(&b->keys, t) {
+ struct bset *i = t->data;
- b->data->format = b->keys.format;
- data = b->data;
- b->data->csum = cpu_to_le64(btree_csum_set(b, b->data));
- sectors_to_write = __set_blocks(b->data,
- le16_to_cpu(b->data->keys.u64s),
- block_bytes(c)) << c->block_bits;
+ if (bset_written(b, i))
+ continue;
- } else {
- struct btree_node_entry *bne = write_block(b);
+ BUG_ON(i->seq != b->data->keys.seq);
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i != btree_bset_last(b) &&
+ bset_end_sector(c, b, i) << 9 !=
+ bset_byte_offset(b, container_of(t[1].data,
+ struct btree_node_entry, keys)));
- data = bne;
- bne->csum = cpu_to_le64(btree_csum_set(b, bne));
- sectors_to_write = __set_blocks(bne,
- le16_to_cpu(bne->keys.u64s),
- block_bytes(c)) << c->block_bits;
+ /*
+ * We may have merely initialized a bset but not written
+ * anything to it yet:
+ */
+ if (i != btree_bset_first(b) && !i->u64s)
+ continue;
+
+ last_set_to_write = i;
+
+ i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+
+ if (t == b->keys.set) {
+ b->data->csum = cpu_to_le64(btree_csum_set(b, b->data));
+ } else {
+ struct btree_node_entry *bne = container_of(i,
+ struct btree_node_entry, keys);
+
+ bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+ }
}
+ BUG_ON(!last_set_to_write);
+ sectors_to_write = bset_end_sector(c, b, last_set_to_write) - b->written;
+ data = write_block(b);
+
BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
/*
@@ -1070,7 +1173,6 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
void bch_btree_node_write(struct cache_set *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held,
- struct btree_iter *iter,
int idx_to_write)
{
BUG_ON(lock_type_held == SIX_LOCK_write);
@@ -1080,7 +1182,7 @@ void bch_btree_node_write(struct cache_set *c, struct btree *b,
if (lock_type_held == SIX_LOCK_intent ||
six_trylock_convert(&b->lock, SIX_LOCK_read,
SIX_LOCK_intent)) {
- bch_btree_init_next(c, b, iter);
+ bch_btree_init_next(c, b, NULL);
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
@@ -1093,31 +1195,10 @@ static void bch_btree_node_write_dirty(struct cache_set *c, struct btree *b,
six_lock_read(&b->lock);
BUG_ON(b->level);
- bch_btree_node_write(c, b, parent, SIX_LOCK_read, NULL, -1);
+ bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
six_unlock_read(&b->lock);
}
-/* buffer up 4k before writing out a bset */
-#define BTREE_WRITE_SET_BUFFER (4 << 10)
-
-/*
- * Write leaf nodes if the unwritten bset is getting too big:
- */
-void bch_btree_node_write_lazy(struct cache_set *c, struct btree *b,
- struct btree_iter *iter)
-{
- struct btree_node_entry *bne =
- container_of(btree_bset_last(b),
- struct btree_node_entry, keys);
- unsigned long bytes = __set_bytes(bne, le16_to_cpu(bne->keys.u64s));
-
- if ((max(round_up(bytes, block_bytes(iter->c)),
- PAGE_SIZE) - bytes < 48 ||
- bytes > BTREE_WRITE_SET_BUFFER) &&
- !btree_node_write_in_flight(b))
- bch_btree_node_write(c, b, NULL, SIX_LOCK_intent, iter, -1);
-}
-
/*
* Write all dirty btree nodes to disk, including roots
*/
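
A compact standalone sketch of the per-key policy in the loop of bch_maybe_compact_whiteouts() above, on a toy in-memory model (toy_bset, toy_bkey and the per-set written flag are assumptions for illustration, not the real packed bsets): live keys are copied through, whiteouts in a bset that has already been written to disk are dropped entirely, and whiteouts in an unwritten bset are kept (the whiteout may still be needed on disk) but have their value stripped, the same trick the patch implements with set_bkeyp_val_u64s().

#include <stdio.h>

/* Toy model (assumed names, not the kernel structs): each bset is an array
 * of key slots; "written" says whether it already went to disk. */
enum { LIVE, WHITEOUT };

struct toy_bkey { int kind; int val_u64s; };
struct toy_bset { struct toy_bkey keys[8]; int nr; int written; };

/* Compact one bset in place: live keys are kept; whiteouts in a written
 * bset are dropped; whiteouts in an unwritten bset are kept but lose
 * their value. */
static void toy_compact(struct toy_bset *t)
{
	int out = 0, i;

	for (i = 0; i < t->nr; i++) {
		struct toy_bkey k = t->keys[i];

		if (k.kind == LIVE) {
			t->keys[out++] = k;
		} else if (!t->written) {
			k.val_u64s = 0;		/* keep whiteout, drop value */
			t->keys[out++] = k;
		}
		/* whiteout in a written bset: dropped entirely */
	}
	t->nr = out;
}

int main(void)
{
	struct toy_bset a = { .keys = { {LIVE, 3}, {WHITEOUT, 3}, {LIVE, 1} },
			      .nr = 3, .written = 1 };
	struct toy_bset b = { .keys = { {WHITEOUT, 2}, {LIVE, 4} },
			      .nr = 2, .written = 0 };

	toy_compact(&a);
	toy_compact(&b);
	printf("written bset: %d keys, unwritten bset: %d keys\n", a.nr, b.nr);
	/* prints: written bset: 2 keys, unwritten bset: 2 keys */
	return 0;
}
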
diff --git a/drivers/md/bcache/btree_io.h b/drivers/md/bcache/btree_io.h
index 701e86d484a4..55c021acd9d9 100644
--- a/drivers/md/bcache/btree_io.h
+++ b/drivers/md/bcache/btree_io.h
@@ -19,6 +19,8 @@ static inline void btree_node_io_lock(struct btree *b)
TASK_UNINTERRUPTIBLE);
}
+bool bch_maybe_compact_whiteouts(struct cache_set *, struct btree *);
+
void bch_btree_sort_into(struct cache_set *, struct btree *, struct btree *);
void bch_btree_build_aux_trees(struct btree *);
@@ -37,13 +39,13 @@ void bch_btree_complete_write(struct cache_set *, struct btree *,
void __bch_btree_node_write(struct cache_set *, struct btree *,
struct closure *, int);
void bch_btree_node_write(struct cache_set *, struct btree *,
- struct closure *, enum six_lock_type,
- struct btree_iter *, int);
-void bch_btree_node_write_lazy(struct cache_set *, struct btree *,
- struct btree_iter *);
+ struct closure *, enum six_lock_type, int);
void bch_btree_flush(struct cache_set *);
void bch_btree_node_flush_journal_entries(struct cache_set *, struct btree *,
struct closure *);
+/* buffer up 4k before writing out a bset */
+#define BTREE_WRITE_SET_BUFFER (4 << 10)
+
#endif /* _BCACHE_BTREE_IO_H */
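
BTREE_WRITE_SET_BUFFER moves into this header because the same test is now applied in two places, bch_btree_init_next() and btree_node_lock_for_insert(): start a new bset when the current last bset has already been written out, or has grown past the 4k buffer, provided there is still room left in the node. A small sketch of that condition (the helper name and parameters below are assumptions, not code from the patch):

#include <stdbool.h>
#include <stdio.h>

#define BTREE_WRITE_SET_BUFFER	(4 << 10)	/* buffer up 4k before writing out a bset */

/* Toy check mirroring the condition in bch_btree_init_next() and
 * btree_node_lock_for_insert(); sizes are in 512-byte sectors except
 * the last bset size, which is in u64s: */
static bool should_start_new_bset(unsigned next_offset_sectors,
				  unsigned node_size_sectors,
				  bool last_bset_written,
				  unsigned last_bset_u64s)
{
	return next_offset_sectors < node_size_sectors &&
	       (last_bset_written ||
		last_bset_u64s * sizeof(unsigned long long) > BTREE_WRITE_SET_BUFFER);
}

int main(void)
{
	/* 4096 bytes / 8 bytes per u64 = 512 u64s; one more crosses the buffer: */
	printf("%d\n", should_start_new_bset(128, 512, false, 513));	/* 1 */
	printf("%d\n", should_start_new_bset(128, 512, false, 100));	/* 0 */
	printf("%d\n", should_start_new_bset(512, 512, true, 100));	/* 0: node full */
	return 0;
}
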
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index cdb11a8734d0..a4e65a896f21 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -34,7 +34,7 @@ void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
for (k = t->data->start;
k != bset_bkey_last(t->data);
k = bkey_next(k))
- if (!bkey_packed_is_whiteout(&b->keys, k)) {
+ if (!bkey_whiteout(k)) {
uk = bkey_unpack_key(&b->keys.format, k);
bch_bkey_format_add_key(s, &uk);
}
@@ -605,7 +605,7 @@ int bch_btree_root_alloc(struct cache_set *c, enum btree_id id,
b = __btree_root_alloc(c, 0, id, reserve);
- bch_btree_node_write(c, b, writes, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
bch_btree_set_root_initial(c, b, reserve);
btree_open_bucket_put(c, b);
@@ -674,7 +674,7 @@ void bch_btree_bset_insert_key(struct btree_iter *iter,
if (k && !bkey_cmp_packed(f, k, &insert->k)) {
t = bch_bkey_to_bset(&b->keys, k);
- if (!bkey_packed_is_whiteout(&b->keys, k))
+ if (!bkey_whiteout(k))
btree_keys_account_key_drop(&b->keys.nr,
t - b->keys.set, k);
@@ -800,16 +800,18 @@ static void verify_keys_sorted(struct keylist *l)
static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
{
struct cache_set *c = iter->c;
+ struct bset *i;
relock:
btree_node_lock_write(b, iter);
+ i = btree_bset_last(b);
/*
- * If the last bset has been written, initialize a new one - check after
- * taking the write lock because it can be written with only a read
- * lock:
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
*/
- if (b->written != c->sb.btree_node_size &&
- bset_written(b, btree_bset_last(b))) {
+ if (next_bset_offset(c, b) < c->sb.btree_node_size &&
+ (bset_written(b, i) ||
+ le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
btree_node_unlock_write(b, iter);
bch_btree_init_next(c, b, iter);
goto relock;
@@ -914,8 +916,7 @@ retry:
list_del(&as->write_blocked_list);
if (list_empty(&b->write_blocked))
- bch_btree_node_write(c, b, NULL, SIX_LOCK_read,
- NULL, -1);
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
six_unlock_read(&b->lock);
break;
@@ -1159,7 +1160,7 @@ bch_btree_insert_keys_interior(struct btree *b,
btree_node_lock_for_insert(b, iter);
if (bch_keylist_u64s(insert_keys) >
- bch_btree_keys_u64s_remaining(iter->c, b)) {
+ bch_btree_keys_u64s_remaining(c, b)) {
btree_node_unlock_write(b, iter);
return BTREE_INSERT_BTREE_NODE_FULL;
}
@@ -1184,7 +1185,7 @@ bch_btree_insert_keys_interior(struct btree *b,
bch_keylist_pop_front(insert_keys);
}
- btree_interior_update_updated_btree(iter->c, as, b);
+ btree_interior_update_updated_btree(c, as, b);
for_each_linked_btree_node(iter, b, linked)
bch_btree_node_iter_peek(&linked->node_iters[b->level],
@@ -1193,7 +1194,7 @@ bch_btree_insert_keys_interior(struct btree *b,
bch_btree_iter_verify(iter, b);
- if (bch_maybe_compact_deleted_keys(&b->keys))
+ if (bch_maybe_compact_whiteouts(c, b))
bch_btree_iter_reinit_node(iter, b);
btree_node_unlock_write(b, iter);
@@ -1375,7 +1376,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
six_unlock_write(&n2->lock);
six_unlock_write(&n1->lock);
- bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
/*
* Note that on recursive parent_keys == insert_keys, so we
@@ -1395,8 +1396,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
btree_split_insert_keys(iter, n3, &as->parent_keys,
reserve);
- bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent,
- NULL, -1);
+ bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
}
} else {
trace_bcache_btree_node_compact(c, b, b->keys.nr.live_u64s);
@@ -1407,7 +1407,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
bch_keylist_add(&as->parent_keys, &n1->key);
}
- bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
/* New nodes all written, now make them visible: */
@@ -1706,7 +1706,7 @@ retry:
bch_keylist_add(&as->parent_keys, &delete);
bch_keylist_add(&as->parent_keys, &n->key);
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
@@ -1768,7 +1768,7 @@ btree_insert_key(struct btree_insert *trans,
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
if (u64s_added > live_u64s_added &&
- bch_maybe_compact_deleted_keys(&b->keys))
+ bch_maybe_compact_whiteouts(iter->c, b))
bch_btree_iter_reinit_node(iter, b);
trace_bcache_btree_insert_key(c, b, insert->k);
@@ -1944,10 +1944,6 @@ unlock:
if (!same_leaf_as_prev(trans, i)) {
foreground_maybe_merge(i->iter, btree_prev_sib);
foreground_maybe_merge(i->iter, btree_next_sib);
-
- if (btree_node_relock(i->iter, 0))
- bch_btree_node_write_lazy(c, i->iter->nodes[0],
- i->iter);
}
out:
/* make sure we didn't lose an error: */
@@ -2238,7 +2234,7 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
trace_bcache_btree_gc_rewrite_node(c, b);
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, NULL, -1);
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
if (parent) {
bch_btree_insert_node(parent, iter,
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index 4311dd17f82a..4846c4790b4f 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -186,6 +186,23 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
return (void *) i > write_block(b);
}
+static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
+ struct bset *i)
+{
+ return round_up(bset_byte_offset(b, bset_bkey_last(i)),
+ block_bytes(c)) >> 9;
+}
+
+static inline unsigned next_bset_offset(struct cache_set *c, struct btree *b)
+{
+
+ unsigned offset = max_t(unsigned, b->written,
+ bset_end_sector(c, b, btree_bset_last(b)));
+
+ EBUG_ON(offset > c->sb.btree_node_size);
+ return offset;
+}
+
static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
struct btree *b)
{
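
The arithmetic behind the two helpers added above is a round-up: bset_end_sector() takes the byte offset of the end of a bset within the node, rounds it up to the filesystem block size, and converts to 512-byte sectors; next_bset_offset() is then the larger of that and the sectors already written. A standalone worked example (the block size, offsets and toy_ names below are made-up illustration values):

#include <stdio.h>

/* round_up() as used in the kernel: round n up to a multiple of 'to'
 * (to must be a power of two). */
static unsigned round_up_pow2(unsigned n, unsigned to)
{
	return (n + to - 1) & ~(to - 1);
}

/* Mirrors bset_end_sector(): byte offset of the end of the bset within
 * the node, rounded up to the block size, converted to 512-byte sectors. */
static unsigned toy_bset_end_sector(unsigned end_byte_offset, unsigned block_bytes)
{
	return round_up_pow2(end_byte_offset, block_bytes) >> 9;
}

int main(void)
{
	/* Example: last key ends 6000 bytes into the node, 4096-byte blocks:
	 * round_up(6000, 4096) = 8192 bytes = 16 sectors. */
	unsigned end_sector = toy_bset_end_sector(6000, 4096);
	unsigned written = 8;	/* sectors already written out (example value) */

	/* next_bset_offset() = max(b->written, bset_end_sector(...)) */
	unsigned next = written > end_sector ? written : end_sector;

	printf("bset ends at sector %u, next bset starts at sector %u\n",
	       end_sector, next);	/* prints 16, 16 */
	return 0;
}
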
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 5419b1f09738..bd3670331102 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -58,7 +58,7 @@ static inline bool should_drop_next_key(struct btree_node_iter *iter,
struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
- if (bkey_packed_is_whiteout(b, k))
+ if (bkey_whiteout(k))
return true;
if (iter->used < 2)
@@ -741,7 +741,7 @@ static void extent_sort_append(struct cache_set *c,
struct bkey_format *f = &b->format;
BKEY_PADDED(k) tmp;
- if (bkey_packed_is_whiteout(b, k))
+ if (bkey_whiteout(k))
return;
bkey_unpack(&tmp.k, f, k);
@@ -1416,7 +1416,7 @@ bch_insert_fixup_extent(struct btree_insert *trans,
struct bpos orig_pos = k.k->p;
/* The insert key completely covers k, invalidate k */
- if (!bkey_is_whiteout(k.k))
+ if (!bkey_whiteout(k.k))
btree_keys_account_key_drop(&b->keys.nr,
t - b->keys.set, _k);