author    Kent Overstreet <kent.overstreet@gmail.com>  2016-11-17 11:42:17 -0900
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-01-18 21:41:08 -0900
commit    b637fe1816a0109680a3b62b77997ac23e55d821 (patch)
tree      56b3747c621525c411fd59d2bed1cb98b7131a04
parent    d0844f5e00edab7a90a0ddcf612e5e76c458b9f4 (diff)
bcache: Move unwritten whiteouts to a separate area
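
The idea, roughly: whiteouts covering keys that already exist on disk are no longer kept interleaved with live keys in the unwritten bset; they are collected in an area that grows down from the end of the btree node buffer and merged back in at write time, and both that area and the not-yet-compacted whiteouts are charged against the node's free space. A minimal, self-contained sketch of that layout arithmetic, mirroring the btree_data_end()/unwritten_whiteouts_start()/unwritten_whiteouts_end() and bch_btree_keys_u64s_remaining() helpers added in btree_update.h below (the struct, field names and sizes here are illustrative stand-ins, not the driver's real definitions):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the driver's btree node buffer bookkeeping. */
struct toy_btree {
	void	*data;			/* start of the node buffer; bsets grow up from here */
	size_t	 bytes;			/* total buffer size, btree_bytes(c) in the patch */
	unsigned whiteout_u64s;		/* unwritten whiteouts parked at the end of the buffer */
	unsigned uncompacted_whiteout_u64s; /* whiteouts still sitting in written bsets */
	unsigned bset_end_u64s;		/* u64 offset of bset_bkey_last() of the last bset */
};

static void *toy_data_end(struct toy_btree *b)
{
	return (char *) b->data + b->bytes;
}

/* The whiteout area grows down from the end of the buffer: */
static uint64_t *toy_whiteouts_start(struct toy_btree *b)
{
	return (uint64_t *) toy_data_end(b) - b->whiteout_u64s;
}

/* Free space = total minus bsets and both whiteout reservations: */
static size_t toy_u64s_remaining(struct toy_btree *b)
{
	size_t used = b->bset_end_u64s + b->whiteout_u64s +
		b->uncompacted_whiteout_u64s;

	return b->bytes / sizeof(uint64_t) - used;
}

int main(void)
{
	static uint64_t buf[512];	/* pretend 4k btree node */
	struct toy_btree b = {
		.data				= buf,
		.bytes				= sizeof(buf),
		.whiteout_u64s			= 24,
		.uncompacted_whiteout_u64s	= 8,
		.bset_end_u64s			= 300,
	};

	printf("whiteout area occupies u64s [%td, %td) of the node\n",
	       toy_whiteouts_start(&b) - buf,
	       (uint64_t *) toy_data_end(&b) - buf);
	printf("u64s remaining for inserts: %zu\n", toy_u64s_remaining(&b));
	return 0;
}

With these numbers the whiteout area occupies the last 24 u64s of the node and 180 u64s remain for inserts.
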
-rw-r--r--  drivers/md/bcache/bkey.c         |   8
-rw-r--r--  drivers/md/bcache/bset.c         |   6
-rw-r--r--  drivers/md/bcache/bset.h         |   6
-rw-r--r--  drivers/md/bcache/btree_cache.c  |  16
-rw-r--r--  drivers/md/bcache/btree_io.c     | 548
-rw-r--r--  drivers/md/bcache/btree_io.h     |   7
-rw-r--r--  drivers/md/bcache/btree_types.h  |  11
-rw-r--r--  drivers/md/bcache/btree_update.c |  92
-rw-r--r--  drivers/md/bcache/btree_update.h |  84
-rw-r--r--  drivers/md/bcache/io_types.h     |   4
-rw-r--r--  include/uapi/linux/bcache.h      |  23
11 files changed, 622 insertions(+), 183 deletions(-)
diff --git a/drivers/md/bcache/bkey.c b/drivers/md/bcache/bkey.c
index 4f1f5317ce43..cf46ef55e0ed 100644
--- a/drivers/md/bcache/bkey.c
+++ b/drivers/md/bcache/bkey.c
@@ -232,8 +232,12 @@ static bool bch_bkey_transform_key(const struct bkey_format *out_f,
if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
return false;
+ /* Can't happen because the val would be too big to unpack: */
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
pack_state_finish(&out_s, out);
out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
+ out->needs_whiteout = in->needs_whiteout;
out->type = in->type;
return true;
@@ -262,9 +266,11 @@ static struct bkey __bkey_unpack_key(const struct bkey_format *format,
EBUG_ON(format->nr_fields != 5);
EBUG_ON(in->u64s < format->key_u64s);
EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
out.format = KEY_FORMAT_CURRENT;
+ out.needs_whiteout = in->needs_whiteout;
out.type = in->type;
out.pad[0] = 0;
out.p.inode = get_inc_field(&state, BKEY_FIELD_INODE);
@@ -301,6 +307,7 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
{
struct pack_state state = pack_state_init(format, out);
+ EBUG_ON((void *) in == (void *) out);
EBUG_ON(format->nr_fields != 5);
EBUG_ON(in->format != KEY_FORMAT_CURRENT);
@@ -323,6 +330,7 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
pack_state_finish(&state, out);
out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->needs_whiteout = in->needs_whiteout;
out->type = in->type;
bch_bkey_pack_verify(out, in, format);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f06b5eeb5d7e..f993a508e84c 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -202,7 +202,8 @@ void bch_btree_node_iter_verify(struct btree_node_iter *iter,
if (bch_btree_node_iter_bset_pos(iter, b, t->data) ==
bset_bkey_last(t->data) &&
(k = bkey_prev_all(t, bset_bkey_last(t->data))))
- BUG_ON(__btree_node_iter_cmp(iter, b, k, first) > 0);
+ BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
+ k, first) > 0);
}
void bch_verify_key_order(struct btree_keys *b,
@@ -1557,7 +1558,8 @@ struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
k = bkey_prev_all(t,
bch_btree_node_iter_bset_pos(iter, b, t->data));
if (k &&
- (!prev || __btree_node_iter_cmp(iter, b, k, prev) > 0)) {
+ (!prev || __btree_node_iter_cmp(iter->is_extents, b,
+ k, prev) > 0)) {
prev = k;
prev_i = t->data;
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 361c8dafd39a..32c715cfea16 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -460,7 +460,7 @@ __btree_node_offset_to_key(struct btree_keys *b, u16 k)
return (void *) ((u64 *) b->set->data + k);
}
-static inline int __btree_node_iter_cmp(struct btree_node_iter *iter,
+static inline int __btree_node_iter_cmp(bool is_extents,
struct btree_keys *b,
struct bkey_packed *l,
struct bkey_packed *r)
@@ -473,7 +473,7 @@ static inline int __btree_node_iter_cmp(struct btree_node_iter *iter,
* For extents, bkey_deleted() is used as a proxy for k->size == 0, so
* deleted keys have to sort last.
*/
- return bkey_cmp_packed(&b->format, l, r) ?: iter->is_extents
+ return bkey_cmp_packed(&b->format, l, r) ?: is_extents
? (int) bkey_deleted(l) - (int) bkey_deleted(r)
: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
@@ -483,7 +483,7 @@ static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
struct btree_node_iter_set l,
struct btree_node_iter_set r)
{
- return __btree_node_iter_cmp(iter, b,
+ return __btree_node_iter_cmp(iter->is_extents, b,
__btree_node_offset_to_key(b, l.k),
__btree_node_offset_to_key(b, r.k));
}
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index 42c2609db4b0..49d87cccaefe 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -166,7 +166,7 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
* after the write, since this node is about to be evicted:
*/
if (btree_node_dirty(b))
- __bch_btree_node_write(c, b, NULL, -1);
+ __bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
/* wait for any in flight btree write */
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
@@ -507,12 +507,14 @@ out_unlock:
list_del_init(&b->list);
mutex_unlock(&c->btree_cache_lock);
out:
- b->flags = 0;
- b->written = 0;
- b->keys.nsets = 0;
- b->keys.set[0].data = NULL;
- b->sib_u64s[0] = 0;
- b->sib_u64s[1] = 0;
+ b->flags = 0;
+ b->written = 0;
+ b->keys.nsets = 0;
+ b->keys.set[0].data = NULL;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
+ b->whiteout_u64s = 0;
+ b->uncompacted_whiteout_u64s = 0;
bch_btree_keys_init(&b->keys, &c->expensive_debug_checks);
bch_time_stats_update(&c->mca_alloc_time, start_time);
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index a467fd4c4421..4b5c4de6b256 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -16,6 +16,34 @@
#include <trace/events/bcache.h>
+static void verify_no_dups(struct btree_keys *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bkey_packed *k;
+
+ for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k))
+ BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+#endif
+}
+
+static void clear_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = false;
+}
+
+static void set_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = true;
+}
+
static void btree_bounce_free(struct cache_set *c, unsigned order,
bool used_mempool, void *p)
{
@@ -127,101 +155,280 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
return ret;
}
-bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+static inline int sort_key_whiteouts_cmp(struct btree_keys *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
{
- struct bset_tree *t, *rebuild_from = NULL;
+ return bkey_cmp_packed(&b->format, l, r);
+}
+
+static unsigned sort_key_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+ bkey_copy(out, in);
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *out = dst;
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+enum compact_mode {
+ COMPACT_LAZY,
+ COMPACT_WRITTEN,
+ COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting,
+ enum compact_mode mode)
+{
+ unsigned live_u64s = b->keys.nr.bset_u64s[t - b->keys.set];
+ unsigned bset_u64s = le16_to_cpu(t->data->u64s);
+
+ if (live_u64s == bset_u64s)
+ return 0;
+
+ if (mode == COMPACT_LAZY) {
+ if (live_u64s * 4 < bset_u64s * 3 ||
+ (compacting && bset_unwritten(b, t->data)))
+ return bset_u64s - live_u64s;
+ } else {
+ if (bset_written(b, t->data))
+ return bset_u64s - live_u64s;
+ }
+
+ return 0;
+}
+
+static bool __compact_whiteouts(struct cache_set *c, struct btree *b,
+ enum compact_mode mode)
+{
+ const struct bkey_format *f = &b->keys.format;
+ struct bset_tree *t;
+ struct bkey_packed *whiteouts = NULL;
+ struct bkey_packed *u_start, *u_pos;
+ struct sort_iter sort_iter;
+ unsigned order, whiteout_u64s = 0, u64s;
+ bool used_mempool, compacting = false;
+
+ if (btree_node_is_extents(b))
+ return false;
+
+ for_each_bset(&b->keys, t)
+ whiteout_u64s += should_compact_bset(b, t,
+ whiteout_u64s != 0, mode);
+
+ if (!whiteout_u64s)
+ return false;
+
+ sort_iter_init(&sort_iter, &b->keys);
+
+ whiteout_u64s += b->whiteout_u64s;
+ order = get_order(whiteout_u64s * sizeof(u64));
+
+ whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+ u_start = u_pos = whiteouts;
+
+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+ sort_iter_add(&sort_iter, u_start, u_pos);
for_each_bset(&b->keys, t) {
struct bset *i = t->data;
- struct bkey_packed *k, *n, *out = i->start;
- unsigned live_u64s = b->keys.nr.bset_u64s[t - b->keys.set];
+ struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
- if (live_u64s * 4 > le16_to_cpu(i->u64s) * 3)
- continue;
+ if (t != b->keys.set && bset_unwritten(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) bset_bkey_last(t[-1].data));
+ }
- if (!bset_written(b, i))
+ if (!should_compact_bset(b, t, compacting, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ t->data = &dst->keys;
+ }
continue;
+ }
+
+ compacting = true;
+ u_start = u_pos;
+ start = i->start;
+ end = bset_bkey_last(i);
- for (k = i->start; k != bset_bkey_last(i); k = n) {
+ if (src != dst) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) bset_bkey_last(t[-1].data));
+
+ memmove(dst, src, sizeof(*src));
+ i = t->data = &dst->keys;
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
n = bkey_next(k);
- if (!bkey_whiteout(k)) {
- bkey_copy(out, k);
- out = bkey_next(out);
- } else if (!bset_written(b, i)) {
- /*
- * We cannot drop whiteouts that haven't been
- * written out - we can drop the value, though:
- *
- * XXX we could drop 0 size extents, if we fix
- * assertions elsewhere
- */
- set_bkeyp_val_u64s(&b->keys.format, k, 0);
+ if (bkey_deleted(k) && btree_node_is_extents(b))
+ continue;
+
+ if (bkey_whiteout(k) && !k->needs_whiteout)
+ continue;
+
+ if (bkey_whiteout(k)) {
+ unreserve_whiteout(b, t, k);
+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+ set_bkeyp_val_u64s(f, u_pos, 0);
+ u_pos = bkey_next(u_pos);
+ } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
bkey_copy(out, k);
out = bkey_next(out);
}
}
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- bch_bset_set_no_aux_tree(&b->keys, t);
- if (!rebuild_from)
- rebuild_from = t;
+ sort_iter_add(&sort_iter, u_start, u_pos);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ bch_bset_set_no_aux_tree(&b->keys, t);
+ }
}
- if (!rebuild_from)
- return false;
+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
- for (t = rebuild_from; t < b->keys.set + b->keys.nsets; t++) {
- /*
- * Compacting a set that hasn't been written out yet might mean
- * we need to shift the set after it down:
- */
- if (!bset_written(b, t->data) &&
- t != bset_tree_last(&b->keys)) {
- struct btree_node_entry *dst = (void *) b->data +
- (bset_end_sector(c, b, t->data) << 9);
- struct btree_node_entry *src = container_of(t[1].data,
- struct btree_node_entry, keys);
-
- if (dst != src) {
- memmove(dst, src,
- (void *) bset_bkey_last(&src->keys) -
- (void *) src);
- t[1].data = &dst->keys;
+ BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+ (void *) bset_bkey_last(btree_bset_last(b)));
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter)
+ : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter);
+
+ BUG_ON(u64s != b->whiteout_u64s);
+ verify_no_dups(&b->keys,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+
+ btree_bounce_free(c, order, used_mempool, whiteouts);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+ bch_btree_build_aux_trees(b);
+
+ bch_btree_keys_u64s_remaining(c, b);
+ bch_verify_btree_nr_keys(&b->keys);
+
+ return true;
+}
+
+bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+{
+ return __compact_whiteouts(c, b, COMPACT_LAZY);
+}
+
+static bool bch_drop_whiteouts(struct btree *b)
+{
+ struct bset_tree *t;
+ bool ret = false;
+
+ for_each_bset(&b->keys, t) {
+ struct bset *i = t->data;
+ struct bkey_packed *k, *n, *out, *start, *end;
+
+ if (!should_compact_bset(b, t, true, true))
+ continue;
+
+ start = i->start;
+ end = bset_bkey_last(i);
+
+ if (bset_unwritten(b, i) &&
+ t != b->keys.set) {
+ struct bset *dst =
+ max_t(struct bset *, write_block(b),
+ (void *) bset_bkey_last(t[-1].data));
+
+ memmove(dst, i, sizeof(struct bset));
+ i = t->data = dst;
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_next(k);
+
+ if (!bkey_whiteout(k)) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
}
}
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ bch_bset_set_no_aux_tree(&b->keys, t);
+ ret = true;
}
- bch_btree_build_aux_trees(b);
bch_verify_btree_nr_keys(&b->keys);
- return true;
+
+ return ret;
}
-static inline int sort_keys_cmp(struct btree_keys *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static int sort_keys_cmp(struct btree_keys *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
{
return bkey_cmp_packed(&b->format, l, r) ?:
- (int) bkey_whiteout(r) - (int) bkey_whiteout(l);
+ (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
}
static unsigned sort_keys(struct bkey_packed *dst,
struct sort_iter *iter,
bool filter_whiteouts)
{
+ const struct bkey_format *f = &iter->b->format;
struct bkey_packed *in, *next, *out = dst;
sort_iter_sort(iter, sort_keys_cmp);
while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- if ((next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(&iter->b->format, in, next))
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
continue;
- if (filter_whiteouts && bkey_whiteout(in))
+ if (bkey_whiteout(in) &&
+ (next = sort_iter_peek(iter)) &&
+ !bkey_cmp_packed(f, in, next)) {
+ BUG_ON(in->needs_whiteout &&
+ next->needs_whiteout);
+ next->needs_whiteout |= in->needs_whiteout;
continue;
+ }
- bkey_copy(out, in);
+ if (bkey_whiteout(in)) {
+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+ set_bkeyp_val_u64s(f, out, 0);
+ } else {
+ bkey_copy(out, in);
+ }
out = bkey_next(out);
}
@@ -262,14 +469,14 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
struct btree_iter *iter,
unsigned start_idx,
unsigned end_idx,
- bool is_write_locked)
+ bool filter_whiteouts)
{
struct btree_node *out;
struct sort_iter sort_iter;
struct bset_tree *t;
- bool used_mempool = false, filter_whiteouts;
+ bool used_mempool = false;
u64 start_time;
- unsigned i, u64s, order, shift = end_idx - start_idx - 1;
+ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->keys.nsets;
@@ -291,7 +498,8 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
start_time = local_clock();
- filter_whiteouts = bset_written(b, b->keys.set[start_idx].data);
+ if (btree_node_is_extents(b))
+ filter_whiteouts = bset_written(b, b->keys.set[start_idx].data);
u64s = btree_node_is_extents(b)
? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
@@ -305,9 +513,6 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
if (sorting_entire_node)
bch_time_stats_update(&c->btree_sort_time, start_time);
- if (!is_write_locked)
- __btree_node_lock_write(b, iter);
-
/* Make sure we preserve bset journal_seq: */
for (t = b->keys.set + start_idx + 1;
t < b->keys.set + end_idx;
@@ -353,9 +558,6 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
bch_bset_set_no_aux_tree(&b->keys, &b->keys.set[start_idx]);
- if (!is_write_locked)
- __btree_node_unlock_write(b, iter);
-
btree_bounce_free(c, order, used_mempool, out);
bch_verify_btree_nr_keys(&b->keys);
@@ -513,7 +715,7 @@ static bool btree_node_compact(struct cache_set *c, struct btree *b,
for (unwritten_idx = 0;
unwritten_idx < b->keys.nsets;
unwritten_idx++)
- if (!bset_written(b, b->keys.set[unwritten_idx].data))
+ if (bset_unwritten(b, b->keys.set[unwritten_idx].data))
break;
if (b->keys.nsets - unwritten_idx > 1) {
@@ -552,39 +754,23 @@ void bch_btree_build_aux_trees(struct btree *b)
void bch_btree_init_next(struct cache_set *c, struct btree *b,
struct btree_iter *iter)
{
- struct bset *i;
+ struct btree_node_entry *bne;
bool did_sort;
- unsigned offset;
- BUG_ON(iter && iter->nodes[b->level] != b);
+ EBUG_ON(!(b->lock.state.seq & 1));
+ EBUG_ON(iter && iter->nodes[b->level] != b);
did_sort = btree_node_compact(c, b, iter);
- /* do verify if we sorted down to a single set: */
if (did_sort && b->keys.nsets == 1)
bch_btree_verify(c, b);
- __btree_node_lock_write(b, iter);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- i = btree_bset_last(b);
- offset = next_bset_offset(c, b);
-
- if (offset < c->sb.btree_node_size &&
- (bset_written(b, i) ||
- le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
- struct btree_node_entry *bne = (void *) b->data + (offset << 9);
-
+ bne = want_new_bset(c, b);
+ if (bne)
bch_bset_init_next(&b->keys, &bne->keys);
- }
bch_btree_build_aux_trees(b);
- __btree_node_unlock_write(b, iter);
-
if (iter && did_sort)
bch_btree_iter_reinit_node(iter, b);
}
@@ -839,6 +1025,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
bch_bset_build_aux_tree(&b->keys, b->keys.set, false);
+ set_needs_whiteout(b->keys.set->data);
+
btree_node_reset_sib_u64s(b);
err = "short btree key";
@@ -981,7 +1169,10 @@ static void btree_node_write_endio(struct bio *bio)
set_btree_node_write_error(b);
if (wbio->bounce)
- bch_bio_free_pages_pool(c, bio);
+ btree_bounce_free(c,
+ wbio->order,
+ wbio->used_mempool,
+ page_address(bio->bi_io_vec[0].bv_page));
if (wbio->put_bio)
bio_put(bio);
@@ -1000,17 +1191,23 @@ static void btree_node_write_endio(struct bio *bio)
void __bch_btree_node_write(struct cache_set *c, struct btree *b,
struct closure *parent,
+ enum six_lock_type lock_type_held,
int idx_to_write)
{
struct bio *bio;
struct bch_write_bio *wbio;
struct bset_tree *t;
- struct bset *last_set_to_write = NULL;
+ struct bset *i;
+ struct btree_node *bn = NULL;
+ struct btree_node_entry *bne = NULL;
BKEY_PADDED(key) k;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct cache *ca;
- size_t sectors_to_write;
+ struct sort_iter sort_iter;
+ unsigned sectors_to_write, order, bytes, u64s;
+ u64 seq = 0;
+ bool used_mempool;
void *data;
/*
@@ -1025,6 +1222,8 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
btree_node_io_lock(b);
+ set_btree_node_just_written(b);
+
BUG_ON(!list_empty(&b->write_blocked));
#if 0
/*
@@ -1050,55 +1249,81 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
change_bit(BTREE_NODE_write_idx, &b->flags);
- for_each_bset(&b->keys, t)
- if (!bset_written(b, t->data))
- break;
-
- if (t != bset_tree_last(&b->keys)) {
- /*
- * See if we can get a write lock and compact these bsets
- * together:
- */
+ if (lock_type_held == SIX_LOCK_intent) {
+ six_lock_write(&b->lock);
+ __compact_whiteouts(c, b, COMPACT_WRITTEN);
+ six_unlock_write(&b->lock);
+ } else {
+ __compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
}
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ sort_iter_init(&sort_iter, &b->keys);
+
+ bytes = !b->written
+ ? sizeof(struct btree_node)
+ : sizeof(struct btree_node_entry);
+
+ bytes += b->whiteout_u64s * sizeof(u64);
+ sort_iter_add(&sort_iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ b->whiteout_u64s = 0;
+
for_each_bset(&b->keys, t) {
- struct bset *i = t->data;
+ i = t->data;
if (bset_written(b, i))
continue;
- BUG_ON(i->seq != b->data->keys.seq);
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
- BUG_ON(i != btree_bset_last(b) &&
- bset_end_sector(c, b, i) << 9 !=
- bset_byte_offset(b, container_of(t[1].data,
- struct btree_node_entry, keys)));
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+ sort_iter_add(&sort_iter, i->start, bset_bkey_last(i));
+ seq = max(seq, le64_to_cpu(i->journal_seq));
+ }
- /*
- * We may have merely initialized a bset but not written
- * anything to it yet:
- */
- if (i != btree_bset_first(b) && !i->u64s)
- continue;
+ order = get_order(bytes);
+ data = btree_bounce_alloc(c, order, &used_mempool);
- last_set_to_write = i;
+ if (!b->written) {
+ bn = data;
+ *bn = *b->data;
+ i = &bn->keys;
+ } else {
+ bne = data;
+ bne->keys = b->data->keys;
+ i = &bne->keys;
+ }
- i->version = cpu_to_le16(BCACHE_BSET_VERSION);
- SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+ i->journal_seq = cpu_to_le64(seq);
- if (t == b->keys.set) {
- b->data->csum = cpu_to_le64(btree_csum_set(b, b->data));
- } else {
- struct btree_node_entry *bne = container_of(i,
- struct btree_node_entry, keys);
+ u64s = btree_node_is_extents(b)
+ ? sort_extents(i->start, &sort_iter, false)
+ : sort_keys(i->start, &sort_iter, false);
+ i->u64s = cpu_to_le16(u64s);
- bne->csum = cpu_to_le64(btree_csum_set(b, bne));
- }
+ clear_needs_whiteout(i);
+
+ if (b->written && !i->u64s) {
+ /* Nothing to write: */
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
+ return;
}
- BUG_ON(!last_set_to_write);
- sectors_to_write = bset_end_sector(c, b, last_set_to_write) - b->written;
- data = write_block(b);
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i->seq != b->data->keys.seq);
+
+ i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+
+ if (bn)
+ bn->csum = cpu_to_le64(btree_csum_set(b, bn));
+ else
+ bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+
+ sectors_to_write = round_up((void *) bset_bkey_last(i) - data,
+ block_bytes(c)) >> 9;
BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
@@ -1119,16 +1344,20 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
set_btree_node_write_error(b);
b->written += sectors_to_write;
+ btree_bounce_free(c, order, used_mempool, data);
btree_node_write_done(c, b);
return;
}
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->bio_write);
+ bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
wbio = to_wbio(bio);
wbio->cl = parent;
wbio->bounce = true;
wbio->put_bio = true;
+ wbio->order = order;
+ wbio->used_mempool = used_mempool;
+ bio->bi_iter.bi_size = sectors_to_write << 9;
bio->bi_end_io = btree_node_write_endio;
bio->bi_private = b;
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
@@ -1136,8 +1365,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
if (parent)
closure_get(parent);
- bch_bio_alloc_pages_pool(c, bio, sectors_to_write << 9);
- memcpy_to_bio(bio, bio->bi_iter, data);
+ bch_bio_map(bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
@@ -1170,6 +1398,64 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
bch_submit_wbio_replicas(wbio, c, &k.key, true);
}
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
+{
+ bool invalidated_iter = false;
+ struct btree_node_entry *bne;
+ struct bset_tree *t;
+
+ if (!btree_node_just_written(b))
+ return false;
+
+ BUG_ON(b->whiteout_u64s);
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ clear_btree_node_just_written(b);
+
+ /*
+ * Note: immediately after write, bset_unwritten()/bset_written() don't
+ * work - the amount of data we had to write after compaction might have
+ * been smaller than the offset of the last bset.
+ *
+ * However, we know that all bsets have been written here, as long as
+ * we're still holding the write lock:
+ */
+
+ /*
+ * XXX: decide if we really want to unconditionally sort down to a
+ * single bset:
+ */
+ if (b->keys.nsets > 1) {
+ btree_node_sort(c, b, NULL, 0, b->keys.nsets, true);
+ invalidated_iter = true;
+ } else {
+ invalidated_iter = bch_drop_whiteouts(b);
+ }
+
+ for_each_bset(&b->keys, t)
+ set_needs_whiteout(t->data);
+
+ /*
+ * If later we don't unconditionally sort down to a single bset, we have
+ * to ensure this is still true:
+ */
+ BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch_bset_init_next(&b->keys, &bne->keys);
+
+ bch_btree_build_aux_trees(b);
+
+ return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
void bch_btree_node_write(struct cache_set *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held,
@@ -1177,15 +1463,19 @@ void bch_btree_node_write(struct cache_set *c, struct btree *b,
{
BUG_ON(lock_type_held == SIX_LOCK_write);
- __bch_btree_node_write(c, b, parent, -1);
-
if (lock_type_held == SIX_LOCK_intent ||
six_trylock_convert(&b->lock, SIX_LOCK_read,
SIX_LOCK_intent)) {
- bch_btree_init_next(c, b, NULL);
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+
+ six_lock_write(&b->lock);
+ bch_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->lock);
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
+ } else {
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
}
}
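
The heart of the new write path is how sort_keys() above merges the whiteout area back together with the unwritten bsets: a whiteout that never covered an on-disk key is dropped outright, and when a whiteout sorts immediately before a newer key at the same position its needs_whiteout flag is handed off to that key instead of emitting both (whiteouts sort first per sort_keys_cmp() above). A toy model of that dedup pass over a pre-sorted array, with illustrative types rather than the driver's bkey_packed:

#include <stdbool.h>
#include <stdio.h>

struct toy_key {
	int	pos;		/* stands in for the bkey's position */
	bool	whiteout;	/* deletion marker, no value */
	bool	needs_whiteout;	/* covers a key that exists on disk */
};

/*
 * Mimics the sort_keys() dedup pass: src is already sorted by position,
 * whiteouts first within a position.
 */
static unsigned toy_sort_keys(struct toy_key *dst, struct toy_key *src,
			      unsigned nr, bool filter_whiteouts)
{
	unsigned i, out = 0;

	for (i = 0; i < nr; i++) {
		struct toy_key *in = &src[i];
		struct toy_key *next = i + 1 < nr ? &src[i + 1] : NULL;

		if (in->whiteout &&
		    (filter_whiteouts || !in->needs_whiteout))
			continue;	/* nothing on disk to white out: drop it */

		if (in->whiteout && next && next->pos == in->pos) {
			/* a newer key at the same position follows: hand the flag off */
			next->needs_whiteout |= in->needs_whiteout;
			continue;
		}

		dst[out++] = *in;	/* the real code also drops a whiteout's value here */
	}

	return out;
}

int main(void)
{
	struct toy_key in[] = {
		{ .pos = 1, .whiteout = true,  .needs_whiteout = true  },
		{ .pos = 1, .whiteout = false },	/* re-inserted after the deletion */
		{ .pos = 2, .whiteout = true,  .needs_whiteout = false },
		{ .pos = 3, .whiteout = true,  .needs_whiteout = true  },
	};
	struct toy_key out[4];
	unsigned i, nr = toy_sort_keys(out, in, 4, false);

	for (i = 0; i < nr; i++)
		printf("pos %d whiteout %d needs_whiteout %d\n",
		       out[i].pos, out[i].whiteout, out[i].needs_whiteout);
	return 0;
}

In this example the pos-2 whiteout disappears entirely and the pos-1 whiteout only survives as a flag on the re-inserted key.
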
diff --git a/drivers/md/bcache/btree_io.h b/drivers/md/bcache/btree_io.h
index 55c021acd9d9..9b66e707d4b0 100644
--- a/drivers/md/bcache/btree_io.h
+++ b/drivers/md/bcache/btree_io.h
@@ -37,7 +37,9 @@ void bch_btree_complete_write(struct cache_set *, struct btree *,
struct btree_write *);
void __bch_btree_node_write(struct cache_set *, struct btree *,
- struct closure *, int);
+ struct closure *, enum six_lock_type, int);
+bool bch_btree_post_write_cleanup(struct cache_set *, struct btree *);
+
void bch_btree_node_write(struct cache_set *, struct btree *,
struct closure *, enum six_lock_type, int);
@@ -45,7 +47,4 @@ void bch_btree_flush(struct cache_set *);
void bch_btree_node_flush_journal_entries(struct cache_set *, struct btree *,
struct closure *);
-/* buffer up 4k before writing out a bset */
-#define BTREE_WRITE_SET_BUFFER (4 << 10)
-
#endif /* _BCACHE_BTREE_IO_H */
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index 5b22b84188df..0d711ec91c20 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -46,6 +46,12 @@ struct btree {
u8 level;
u8 btree_id;
u16 sib_u64s[2];
+ u16 whiteout_u64s;
+ u16 uncompacted_whiteout_u64s;
+
+ struct btree_keys keys;
+ struct btree_node *data;
+
/*
* XXX: add a delete sequence number, so when btree_node_relock() fails
* because the lock sequence number has changed - i.e. the contents were
@@ -64,9 +70,6 @@ struct btree {
*/
struct list_head write_blocked;
- struct btree_keys keys;
- struct btree_node *data;
-
struct open_bucket *ob;
/* lru list */
@@ -92,6 +95,7 @@ enum btree_flags {
BTREE_NODE_write_idx,
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
+ BTREE_NODE_just_written,
};
BTREE_FLAG(read_error);
@@ -100,6 +104,7 @@ BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index a4e65a896f21..f3579b8a678c 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -655,7 +655,7 @@ static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
-void bch_btree_bset_insert_key(struct btree_iter *iter,
+bool bch_btree_bset_insert_key(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
@@ -665,27 +665,69 @@ void bch_btree_bset_insert_key(struct btree_iter *iter,
struct bset_tree *t;
unsigned clobber_u64s;
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
k = bch_btree_node_iter_peek_all(node_iter, &b->keys);
if (k && !bkey_cmp_packed(f, k, &insert->k)) {
+ BUG_ON(bkey_whiteout(k));
+
t = bch_bkey_to_bset(&b->keys, k);
- if (!bkey_whiteout(k))
- btree_keys_account_key_drop(&b->keys.nr,
- t - b->keys.set, k);
+ if (bset_unwritten(b, t->data) &&
+ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
+ BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
+
+ k->type = insert->k.type;
+ memcpy_u64s(bkeyp_val(f, k), &insert->v,
+ bkey_val_u64s(&insert->k));
+ return true;
+ }
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+
+ btree_keys_account_key_drop(&b->keys.nr, t - b->keys.set, k);
if (t == bset_tree_last(&b->keys)) {
clobber_u64s = k->u64s;
+
+ /*
+ * If we're deleting, and the key we're deleting doesn't
+ * need a whiteout (it wasn't overwriting a key that had
+ * been written to disk) - just delete it:
+ */
+ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+ bch_bset_delete(&b->keys, k, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ k, clobber_u64s, 0);
+ return true;
+ }
+
goto overwrite;
}
k->type = KEY_TYPE_DELETED;
bch_btree_node_iter_fix(iter, b, node_iter, t, k,
k->u64s, k->u64s);
+
+ if (bkey_whiteout(&insert->k)) {
+ reserve_whiteout(b, t, k);
+ return true;
+ } else {
+ k->needs_whiteout = false;
+ }
+ } else {
+ /*
+ * Deleting, but the key to delete wasn't found - nothing to do:
+ */
+ if (bkey_whiteout(&insert->k))
+ return false;
+
+ insert->k.needs_whiteout = false;
}
t = bset_tree_last(&b->keys);
@@ -693,9 +735,10 @@ void bch_btree_bset_insert_key(struct btree_iter *iter,
clobber_u64s = 0;
overwrite:
bch_bset_insert(&b->keys, node_iter, k, insert, clobber_u64s);
- if (k->u64s != clobber_u64s || bkey_deleted(&insert->k))
+ if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
bch_btree_node_iter_fix(iter, b, node_iter, t, k,
clobber_u64s, k->u64s);
+ return true;
}
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
@@ -720,7 +763,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
* shouldn't:
*/
if (!b->level)
- __bch_btree_node_write(c, b, NULL, i);
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
six_unlock_read(&b->lock);
}
@@ -755,9 +798,13 @@ void bch_btree_journal_key(struct btree_insert *trans,
if (trans->journal_res.ref) {
u64 seq = bch_journal_res_seq(j, &trans->journal_res);
+ bool needs_whiteout = insert->k.needs_whiteout;
+ /* ick */
+ insert->k.needs_whiteout = false;
bch_journal_add_keys(j, &trans->journal_res,
b->btree_id, insert);
+ insert->k.needs_whiteout = needs_whiteout;
if (trans->journal_seq)
*trans->journal_seq = seq;
@@ -776,11 +823,11 @@ bch_insert_fixup_key(struct btree_insert *trans,
BUG_ON(iter->level);
- bch_btree_journal_key(trans, iter, insert->k);
- bch_btree_bset_insert_key(iter,
- iter->nodes[0],
- &iter->node_iters[0],
- insert->k);
+ if (bch_btree_bset_insert_key(iter,
+ iter->nodes[0],
+ &iter->node_iters[0],
+ insert->k))
+ bch_btree_journal_key(trans, iter, insert->k);
trans->did_work = true;
return BTREE_INSERT_OK;
@@ -800,22 +847,19 @@ static void verify_keys_sorted(struct keylist *l)
static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
{
struct cache_set *c = iter->c;
- struct bset *i;
-relock:
+
btree_node_lock_write(b, iter);
- i = btree_bset_last(b);
+
+ if (btree_node_just_written(b) &&
+ bch_btree_post_write_cleanup(c, b))
+ bch_btree_iter_reinit_node(iter, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
- if (next_bset_offset(c, b) < c->sb.btree_node_size &&
- (bset_written(b, i) ||
- le16_to_cpu(i->u64s) * sizeof(u64) > BTREE_WRITE_SET_BUFFER)) {
- btree_node_unlock_write(b, iter);
+ if (want_new_bset(c, b))
bch_btree_init_next(c, b, iter);
- goto relock;
- }
}
/* Asynchronous interior node update machinery */
@@ -1877,6 +1921,12 @@ retry:
if (!same_leaf_as_prev(trans, i))
u64s = 0;
+ /*
+ * bch_btree_node_insert_fits() must be called under write lock:
+ * with only an intent lock, another thread can still call
+ * bch_btree_node_write(), converting an unwritten bset to a
+ * written one
+ */
if (!i->done) {
u64s += i->k->k.u64s;
if (!bch_btree_node_insert_fits(c,
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index 4846c4790b4f..65b0421b6b9c 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -166,11 +166,28 @@ int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);
/* Inserting into a given leaf node (last stage of insert): */
-void bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
+bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
struct bkey_i *);
+static inline void *btree_data_end(struct cache_set *c, struct btree *b)
+{
+ return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct cache_set *c,
+ struct btree *b)
+{
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct cache_set *c,
+ struct btree *b)
+{
+ return btree_data_end(c, b);
+}
+
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
@@ -193,21 +210,13 @@ static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
block_bytes(c)) >> 9;
}
-static inline unsigned next_bset_offset(struct cache_set *c, struct btree *b)
-{
-
- unsigned offset = max_t(unsigned, b->written,
- bset_end_sector(c, b, btree_bset_last(b)));
-
- EBUG_ON(offset > c->sb.btree_node_size);
- return offset;
-}
-
static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
- unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64);
+ unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+ b->whiteout_u64s +
+ b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
EBUG_ON(used > total);
@@ -218,6 +227,36 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
return total - used;
}
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+ /*
+ * Could buffer up larger amounts of keys for btrees with larger keys,
+ * pending benchmarking:
+ */
+ return 4 << 10;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
+ struct btree *b)
+{
+ struct bset *i = btree_bset_last(b);
+ unsigned offset = max_t(unsigned, b->written << 9,
+ bset_byte_offset(b, bset_bkey_last(i)));
+ ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+ (offset + sizeof(struct btree_node_entry) +
+ b->whiteout_u64s * sizeof(u64) +
+ b->uncompacted_whiteout_u64s * sizeof(u64));
+
+ EBUG_ON(offset > btree_bytes(c));
+
+ if ((unlikely(bset_written(b, i)) && n > 0) ||
+ (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
+ btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+ return (void *) b->data + offset;
+
+ return NULL;
+}
+
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
@@ -235,6 +274,27 @@ static inline bool bch_btree_node_insert_fits(struct cache_set *c,
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, t->data)) {
+ EBUG_ON(b->uncompacted_whiteout_u64s <
+ bkeyp_key_u64s(&b->keys.format, k));
+ b->uncompacted_whiteout_u64s -=
+ bkeyp_key_u64s(&b->keys.format, k);
+ }
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, t->data)) {
+ BUG_ON(!k->needs_whiteout);
+ b->uncompacted_whiteout_u64s +=
+ bkeyp_key_u64s(&b->keys.format, k);
+ }
+}
+
void bch_btree_insert_node(struct btree *, struct btree_iter *,
struct keylist *, struct btree_reserve *,
struct btree_interior_update *as);
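
want_new_bset() above replaces the old fixed BTREE_WRITE_SET_BUFFER check: a new bset is started either when the last bset has already been written out (and anything at all still fits), or when it has grown past the per-node write buffer and at least another buffer's worth of space remains after the whiteout reservations are accounted for. A compact model of that decision, with illustrative names and sizes (it omits the struct btree_node_entry header term from the patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_WRITE_BUFFER	(4 << 10)	/* btree_write_set_buffer() in the patch */

struct toy_node {
	size_t	node_bytes;		/* btree_bytes(c) */
	size_t	last_bset_end;		/* byte offset of the end of the last bset */
	size_t	last_bset_bytes;	/* __set_bytes() of the last bset */
	bool	last_bset_written;
	size_t	whiteout_bytes;		/* both whiteout reservations, in bytes */
};

/* Roughly want_new_bset(): should we start a new bset to insert into? */
static bool toy_want_new_bset(const struct toy_node *b)
{
	long n = (long) b->node_bytes -
		 (long) (b->last_bset_end + b->whiteout_bytes);

	if (b->last_bset_written && n > 0)
		return true;
	if (b->last_bset_bytes > TOY_WRITE_BUFFER && n > TOY_WRITE_BUFFER)
		return true;
	return false;
}

int main(void)
{
	struct toy_node b = {
		.node_bytes		= 64 << 10,
		.last_bset_end		= 20 << 10,
		.last_bset_bytes	= 6 << 10,	/* past the 4k write buffer */
		.last_bset_written	= false,
		.whiteout_bytes		= 1 << 10,
	};

	printf("start a new bset? %s\n", toy_want_new_bset(&b) ? "yes" : "no");
	return 0;
}

With these numbers it reports "yes": the 6k bset has outgrown the 4k buffer and well over another 4k of headroom remains.
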
diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h
index 2760d0f0edd8..1cea24165f0e 100644
--- a/drivers/md/bcache/io_types.h
+++ b/drivers/md/bcache/io_types.h
@@ -74,6 +74,10 @@ struct bch_write_bio {
bounce:1,
put_bio:1;
+ /* Only for btree writes: */
+ unsigned used_mempool:1;
+ u8 order;
+
struct bio bio;
};
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 5f2558b7a096..2f1cc61be66e 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -109,7 +109,13 @@ struct bkey {
__u8 u64s;
/* Format of key (0 for format local to btree node) */
- __u8 format;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#endif
/* Type of the value */
__u8 type;
@@ -136,7 +142,19 @@ struct bkey_packed {
__u8 u64s;
/* Format of key (0 for format local to btree node) */
- __u8 format;
+
+ /*
+ * XXX: next incompat on disk format change, switch format and
+ * needs_whiteout - bkey_packed() will be cheaper if format is the high
+ * bits of the bitfield
+ */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#endif
/* Type of the value */
__u8 type;
@@ -256,6 +274,7 @@ struct bkey_i_##name { \
#define KEY_TYPE_DISCARD 1
#define KEY_TYPE_ERROR 2
#define KEY_TYPE_COOKIE 3
+#define KEY_TYPE_PERSISTENT_DISCARD 4
#define KEY_TYPE_GENERIC_NR 128
struct bch_cookie {