author     Kent Overstreet <kent.overstreet@gmail.com>   2015-05-28 23:57:20 -0700
committer  Kent Overstreet <kent.overstreet@gmail.com>   2016-10-07 12:34:20 -0800
commit     5749e6138348d7c1546e28b4ac0ae9032c94e0c0 (patch)
tree       ac6f6ce3dced78b075c0b54ba5e0ad9a728a9951
parent     9d5c579320bcf93315155a30a1b6b975f2811468 (diff)
bcache: data checksumming
-rw-r--r--   drivers/md/bcache/Kconfig          2
-rw-r--r--   drivers/md/bcache/alloc.c          95
-rw-r--r--   drivers/md/bcache/alloc.h          6
-rw-r--r--   drivers/md/bcache/bcache.h         9
-rw-r--r--   drivers/md/bcache/bset.h           26
-rw-r--r--   drivers/md/bcache/btree.c          11
-rw-r--r--   drivers/md/bcache/btree.h          2
-rw-r--r--   drivers/md/bcache/buckets.c        15
-rw-r--r--   drivers/md/bcache/buckets.h        15
-rw-r--r--   drivers/md/bcache/debug.c          2
-rw-r--r--   drivers/md/bcache/extents.c        680
-rw-r--r--   drivers/md/bcache/extents.h        290
-rw-r--r--   drivers/md/bcache/io.c             1067
-rw-r--r--   drivers/md/bcache/io.h             55
-rw-r--r--   drivers/md/bcache/io_types.h       12
-rw-r--r--   drivers/md/bcache/journal.c        31
-rw-r--r--   drivers/md/bcache/migrate.c        8
-rw-r--r--   drivers/md/bcache/move.c           14
-rw-r--r--   drivers/md/bcache/request.c        71
-rw-r--r--   drivers/md/bcache/super.c          30
-rw-r--r--   drivers/md/bcache/tier.c           27
-rw-r--r--   include/trace/events/bcache.h      10
-rw-r--r--   include/uapi/linux/bcache.h        156
23 files changed, 1983 insertions, 651 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 55e135f6dd61..0f9410c06c45 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -3,6 +3,8 @@ config BCACHE
tristate "Block device as cache"
select LIBCRC32C
select FS_POSIX_ACL
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
---help---
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d567f4ae6df3..7b51888c5968 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1271,59 +1271,43 @@ static void verify_not_stale(struct cache_set *c, const struct open_bucket *ob)
}
/*
- * Allocates some space in the cache to write to, and k to point to the newly
- * allocated space, and updates k->size and k->offset (to point to the
- * end of the newly allocated space).
- *
- * May allocate fewer sectors than @sectors, k->size indicates how many
- * sectors were actually allocated.
- *
- * Return codes:
- * - -EAGAIN: closure was added to waitlist
- * - -ENOSPC: out of space and no closure provided
- *
- * @c - cache set.
- * @wp - write point to use for allocating sectors.
- * @k - key to return the allocated space information.
- * @cl - closure to wait for a bucket
+ * Get us an open_bucket we can allocate from, return with it locked:
*/
-struct open_bucket *bch_alloc_sectors(struct cache_set *c,
- struct write_point *wp,
- struct bkey_i *k,
- bool check_enospc,
- struct closure *cl)
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
+ struct write_point *wp,
+ bool check_enospc,
+ struct closure *cl)
{
- struct bkey_s_extent dst;
- struct bch_extent_ptr *ptr;
struct open_bucket *ob;
- struct cache *ca;
- unsigned sectors;
ob = lock_and_refill_writepoint(c, wp, check_enospc, cl);
if (IS_ERR_OR_NULL(ob))
return ob;
BUG_ON(!ob->sectors_free);
-
verify_not_stale(c, ob);
+ return ob;
+}
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+void bch_alloc_sectors_done(struct cache_set *c, struct write_point *wp,
+ struct bkey_i *k, struct open_bucket *ob,
+ unsigned sectors)
+{
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ unsigned i;
+
/*
* We're keeping any existing pointer k has, and appending new pointers:
* __bch_write() will only write to the pointers we add here:
*/
- dst = bkey_i_to_s_extent(k);
-
- /* Set up the pointer to the space we're allocating: */
- memcpy(&dst.v->ptr[bch_extent_ptrs(dst)],
- ob->ptrs, ob->nr_ptrs * sizeof(u64));
-
- bch_set_extent_ptrs(dst, bch_extent_ptrs(dst) + ob->nr_ptrs);
-
- sectors = min_t(unsigned, dst.k->size, ob->sectors_free);
-
- bch_key_resize(dst.k, sectors);
-
- /* update open bucket for next time: */
+ for (i = 0; i < ob->nr_ptrs; i++)
+ extent_ptr_append(bkey_i_to_extent(k), ob->ptrs[i]);
ob->sectors_free -= sectors;
if (ob->sectors_free)
@@ -1341,6 +1325,41 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
rcu_read_unlock();
mutex_unlock(&ob->lock);
+}
+
+/*
+ * Allocates some space in the cache to write to, sets up @k to point to the
+ * newly allocated space, and updates k->size and k->offset (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors; k->size indicates how many
+ * sectors were actually allocated.
+ *
+ * Return codes:
+ * - -EAGAIN: closure was added to waitlist
+ * - -ENOSPC: out of space and no closure provided
+ *
+ * @c - cache set.
+ * @wp - write point to use for allocating sectors.
+ * @k - key to return the allocated space information.
+ * @cl - closure to wait for a bucket
+ */
+struct open_bucket *bch_alloc_sectors(struct cache_set *c,
+ struct write_point *wp,
+ struct bkey_i *k,
+ bool check_enospc,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ ob = bch_alloc_sectors_start(c, wp, check_enospc, cl);
+ if (IS_ERR_OR_NULL(ob))
+ return ob;
+
+ if (k->k.size > ob->sectors_free)
+ bch_key_resize(&k->k, ob->sectors_free);
+
+ bch_alloc_sectors_done(c, wp, k, ob, k->k.size);
return ob;
}
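A hedged caller-side sketch (not part of the patch; example_alloc() and want_sectors are invented names) of how the two halves compose: splitting start/done lets the caller look at ob->sectors_free before deciding how many sectors to commit, presumably what the compressing write path needs. bch_alloc_sectors() above is this same sequence with the sector count clamped to the key size.

/* Illustrative only -- mirrors bch_alloc_sectors() above: */
static struct open_bucket *example_alloc(struct cache_set *c,
					 struct write_point *wp,
					 struct bkey_i *k,
					 unsigned want_sectors,
					 struct closure *cl)
{
	struct open_bucket *ob;
	unsigned sectors;

	ob = bch_alloc_sectors_start(c, wp, true, cl);
	if (IS_ERR_OR_NULL(ob))
		return ob;		/* waiting on a bucket, or out of space */

	/* decide how much to use now that ob->sectors_free is known: */
	sectors = min_t(unsigned, want_sectors, ob->sectors_free);
	bch_key_resize(&k->k, sectors);

	/* append ob's pointers to @k and consume the space: */
	bch_alloc_sectors_done(c, wp, k, ob, sectors);
	return ob;
}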
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 0ab405a19da9..c0118db8440e 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -16,6 +16,12 @@ void bch_prio_timer_start(struct cache_set *, int);
void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
+ struct write_point *,
+ bool, struct closure *);
+void bch_alloc_sectors_done(struct cache_set *, struct write_point *,
+ struct bkey_i *, struct open_bucket *, unsigned);
+
struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
struct bkey_i *, bool, struct closure *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 807278e80500..a160f5946c6e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -213,6 +213,7 @@
#include "blockdev_types.h"
#include "buckets_types.h"
#include "clock_types.h"
+#include "io_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "keybuf_types.h"
@@ -418,8 +419,6 @@ struct cache_set {
struct closure sb_write;
struct semaphore sb_write_mutex;
- struct bio_set bio_split;
-
struct backing_dev_info bdi;
/* BTREE CACHE */
@@ -563,7 +562,13 @@ struct cache_set {
struct rw_semaphore gc_lock;
/* IO PATH */
+ struct bio_set bio_read;
struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+ mempool_t compression_workspace_pool;
+ struct bio_decompress_worker __percpu
+ *bio_decompress_worker;
/* For punting bio submissions to workqueue, io.c */
struct bio_list bio_submit_list;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1cb60b65322a..4d8fb84d8c55 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -324,10 +324,30 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
/* Bkey utility code */
-#define BKEY_EXTENT_PTRS_MAX 4
-#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_PTRS_MAX)
+/* Amount of space we might need, in order to add a single pointer */
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_PTRS_MAX)
+/* XXX: move constants to uapi/linux/bcache.h */
+
+#define BKEY_EXTENT_PTR_MAX_U64s \
+ ((sizeof(struct bch_extent_crc64) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(u64))
+
+#define BKEY_EXTENT_PTRS_MAX 4
+
+#if 0
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ ((sizeof(struct bch_extent_crc64) +
+ sizeof(struct bch_extent_ptr)) * BKEY_EXTENT_PTRS_MAX)
+#else
+#define BKEY_EXTENT_VAL_U64s_MAX 8
+#endif
+
+#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+#define BKEY_BTREE_PTR_VAL_U64s_MAX BKEY_EXTENT_PTRS_MAX
+#define BKEY_BTREE_PTR_U64s_MAX (BKEY_U64s + BKEY_EXTENT_PTRS_MAX)
+
+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
#define __bkey_idx(_set, _offset) \
((_set)->_data + (_offset))
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index eac7354c572f..e422c4b2b0e2 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -335,6 +335,12 @@ static void bch_btree_init_next(struct cache_set *c, struct btree *b,
/* Btree IO */
+/*
+ * We seed the checksum with the entire first pointer (dev, gen and offset),
+ * since for btree nodes we have to store the checksum with the data instead of
+ * the pointer - this helps guard against reading a valid btree node that is not
+ * the node we actually wanted:
+ */
#define btree_csum_set(_b, _i) \
({ \
void *_data = (void *) (_i) + 8; \
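The effect of the seed is easy to model outside the kernel. A standalone sketch (FNV-1a standing in for the checksum the macro actually computes; all names invented): the same node bytes verify when reached through the pointer they were written at, and fail when reached through any other pointer.

#include <stdint.h>
#include <stdio.h>

static uint64_t fnv1a(uint64_t h, const void *p, size_t len)
{
	const uint8_t *d = p;

	while (len--) {
		h ^= *d++;
		h *= 0x100000001b3ULL;
	}
	return h;
}

struct node { uint64_t seq; char payload[56]; };

/* checksum of the node contents, seeded with the pointer used to read it */
static uint64_t node_csum(uint64_t ptr_offset, const struct node *n)
{
	uint64_t h = fnv1a(0xcbf29ce484222325ULL, &ptr_offset, sizeof(ptr_offset));

	return fnv1a(h, n, sizeof(*n));
}

int main(void)
{
	struct node n = { .seq = 1, .payload = "same bytes, two locations" };
	uint64_t stored = node_csum(4096, &n);	/* node written at offset 4096 */

	printf("read via offset 4096: %s\n",
	       node_csum(4096, &n) == stored ? "csum ok" : "csum bad");
	printf("read via offset 8192: %s\n",
	       node_csum(8192, &n) == stored ? "csum ok" : "csum bad");
	return 0;
}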
@@ -573,7 +579,7 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b)
bch_bio_map(bio, b->data);
bio_get(bio);
- bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true);
+ bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true);
closure_sync(&cl);
@@ -2351,8 +2357,7 @@ struct btree_split_state {
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
- u64 inline_keys[(BKEY_U64s +
- BKEY_EXTENT_PTRS_MAX) * 3];
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
struct btree_reserve *reserve;
};
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index ec467b1b56d6..835d4bcf166b 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -101,7 +101,7 @@ struct btree {
struct rhash_head hash;
/* Key/pointer for this btree node */
- BKEY_PADDED(key);
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
/* Single bit - set when accessed, cleared by shrinker */
unsigned long accessed;
diff --git a/drivers/md/bcache/buckets.c b/drivers/md/bcache/buckets.c
index 3d92c5db96d3..3c96ce502c10 100644
--- a/drivers/md/bcache/buckets.c
+++ b/drivers/md/bcache/buckets.c
@@ -297,7 +297,7 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b,
struct bkey_s_c_extent e, int sectors,
bool fail_if_stale, bool metadata)
{
- const struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr, *ptr2;
struct cache *ca;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
@@ -350,11 +350,14 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b,
return 0;
stale:
- while (--ptr >= e.v->ptr)
- if ((ca = PTR_CACHE(c, ptr)))
- bch_mark_bucket(c, ca, b, ptr, -sectors,
- bch_extent_ptr_is_dirty(c, e, ptr),
- metadata);
+ extent_for_each_online_device(c, e, ptr2, ca) {
+ if (ptr2 == ptr)
+ break;
+
+ bch_mark_bucket(c, ca, b, ptr2, -sectors,
+ bch_extent_ptr_is_dirty(c, e, ptr2),
+ metadata);
+ }
rcu_read_unlock();
return -1;
diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h
index a2b8e479f80c..5e191ef91812 100644
--- a/drivers/md/bcache/buckets.h
+++ b/drivers/md/bcache/buckets.h
@@ -48,19 +48,20 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c,
const struct bkey_i *k,
unsigned ptr)
{
- const struct cache *ca;
size_t bucket = 0;
-
+#if 0
if (bkey_extent_is_data(&k->k)) {
- const struct bkey_i_extent *e = bkey_i_to_extent_c(k);
- const struct bch_extent_ptr *p = &e->v.ptr[ptr];
+ const struct bch_extent_ptr *ptr;
+ const struct cache *ca;
rcu_read_lock();
- if ((ca = PTR_CACHE(c, p)))
- bucket = PTR_BUCKET_NR(ca, p);
+ extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) {
+ bucket = PTR_BUCKET_NR(ca, ptr);
+ break;
+ }
rcu_read_unlock();
}
-
+#endif
return bucket;
}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index bb4d715c9b15..99d4657c4f4a 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -74,7 +74,7 @@ void bch_btree_verify(struct cache_set *c, struct btree *b)
bio->bi_end_io = btree_verify_endio;
bch_bio_map(bio, n_sorted);
- bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true);
+ bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true);
closure_sync(&cl);
bio_put(bio);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 140b7a9fed3f..4d5889d6d107 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -162,42 +162,104 @@ bool bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
return false;
}
-static bool should_drop_ptr(const struct cache_set *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
+/* returns true if equal */
+static bool crc_cmp(union bch_extent_entry *l, union bch_extent_entry *r)
{
- struct cache *ca;
- struct cache_member *mi;
+ return extent_entry_type(l) == extent_entry_type(r) &&
+ !memcmp(l, r, extent_entry_bytes(l));
+}
- if (ptr->dev == PTR_LOST_DEV)
- return false;
+/* Increment pointers after @crc by crc's offset until the next crc entry: */
+void extent_adjust_pointers(struct bkey_s_extent e, union bch_extent_entry *crc)
+{
+ union bch_extent_entry *entry;
+ unsigned offset = crc_to_64((void *) crc).offset;
- if (ptr->dev >= c->sb.nr_in_set)
- return true;
+ extent_for_each_entry_from(e, entry, extent_entry_next(crc)) {
+ if (!extent_entry_is_ptr(entry))
+ return;
- mi = rcu_dereference(c->members)->m;
+ entry->ptr.offset += offset;
+ }
+}
- if (bch_is_zero(mi[ptr->dev].uuid.b, sizeof(uuid_le)))
- return true;
+static void extent_cleanup_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_entry *crc = e.v->start, *prev = NULL;
- if (bch_extent_ptr_is_dirty(c, e, ptr))
- return false;
+ while (crc != extent_entry_last(e)) {
+ union bch_extent_entry *next = extent_entry_next(crc);
+ size_t crc_u64s = extent_entry_u64s(crc);
- return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+ if (!extent_entry_is_crc(crc))
+ goto next;
+
+ if (next != extent_entry_last(e) &&
+ extent_entry_is_crc(next)) {
+ /*
+ * Two crc entries right after the other, the first one
+ * doesn't have any pointers and we can just drop it:
+ */
+ goto drop;
+ }
+
+ if (prev && crc_cmp(crc, prev)) {
+ /*
+ * This crc entry is identical to the previous one, drop
+ * it:
+ */
+ goto drop;
+ }
+
+ if (!prev &&
+ !crc_to_64((void *) crc).csum_type &&
+ !crc_to_64((void *) crc).compression_type) {
+ extent_adjust_pointers(e, crc);
+ goto drop;
+ }
+
+ prev = crc;
+next:
+ crc = next;
+ continue;
+drop:
+ memmove(crc, next,
+ (void *) extent_entry_last(e) - (void *) next);
+ e.k->u64s -= crc_u64s;
+ }
}
-void bch_extent_drop_stale(struct cache_set *c, struct bkey_s k)
+void bch_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
{
- struct bkey_s_extent e = bkey_s_to_extent(k);
- struct bch_extent_ptr *ptr;
+ __bch_extent_drop_ptr(e, ptr);
+ extent_cleanup_crcs(e);
+}
- rcu_read_lock();
+static bool should_drop_ptr(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ struct cache *ca;
+
+ return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+}
- extent_for_each_ptr_backwards(e, ptr)
- if (should_drop_ptr(c, e.c, ptr))
- bch_extent_drop_ptr(e, ptr);
+void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct bch_extent_ptr *ptr = &e.v->start->ptr;
+ bool dropped = false;
+ rcu_read_lock();
+ while ((ptr = extent_ptr_next(e, ptr)))
+ if (should_drop_ptr(c, e.c, ptr)) {
+ __bch_extent_drop_ptr(e, ptr);
+ dropped = true;
+ } else
+ ptr++;
rcu_read_unlock();
+
+ if (dropped)
+ extent_cleanup_crcs(e);
}
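To make the cleanup cases concrete, an illustrative before/after of an extent value (entry widths not to scale; not taken from the patch):

/*
 *   before:  [crc A][ptr 0][crc B][crc A][ptr 1]
 *   after:   [crc A][ptr 0][ptr 1]
 *
 * - crc B is immediately followed by another crc entry, so it covers no
 *   pointers and is dropped;
 * - the second crc A is identical to the previous (kept) crc entry, so
 *   ptr 1 can share the first one and it is dropped;
 * - separately, a leading crc that carries neither a checksum nor
 *   compression is folded into its pointers by extent_adjust_pointers()
 *   and then dropped.
 */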
static bool bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k)
@@ -207,145 +269,123 @@ static bool bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k)
return bch_extent_normalize(b->c, k);
}
-/*
- * Common among btree pointers and normal data extents
- */
-static bool __ptr_invalid(const struct cache_set *c, struct bkey_s_c k)
+static const char *extent_ptr_invalid(const struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk)
{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- struct cache_member *mi;
- bool ret = true;
+ const struct cache_member *m = mi->m + ptr->dev;
- if (k.k->u64s < BKEY_U64s)
- return true;
+ if (ptr->dev == PTR_LOST_DEV) /* XXX: kill */
+ return NULL;
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX)
- return true;
+ if (ptr->dev > mi->nr_in_set ||
+ bch_is_zero(m->uuid.b, sizeof(uuid_le)))
+ return "pointer to invalid device";
- mi = cache_member_info_get(c)->m;
+ if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets)
+ return "offset past end of device";
- extent_for_each_ptr(e, ptr) {
- struct cache_member *m = mi + ptr->dev;
-
- if (ptr->dev > c->sb.nr_in_set) {
- if (ptr->dev != PTR_LOST_DEV)
- goto invalid;
-
- continue;
- }
+ if (ptr->offset < m->bucket_size * m->first_bucket)
+ return "offset before first bucket";
- if ((ptr->offset + e.k->size >
- m->bucket_size * m->nbuckets) ||
- (ptr->offset <
- m->bucket_size * m->first_bucket) ||
- ((ptr->offset & (m->bucket_size - 1)) + e.k->size >
- m->bucket_size))
- goto invalid;
- }
-
- ret = false;
-invalid:
- cache_member_info_put();
- break;
- default:
- return true;
- }
+ if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size)
+ return "spans multiple buckets";
- return ret;
+ return NULL;
}
-/*
- * Should match __extent_invalid() - returns the reason an extent is invalid
- */
-static const char *bch_ptr_status(const struct cache_set *c,
- struct cache_member *mi,
- struct bkey_s_c_extent e)
+static size_t extent_print_ptrs(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c_extent e)
{
+ char *out = buf, *end = buf + size;
+ const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc64 crc;
+ struct cache *ca;
+ bool first = true;
- if (!bch_extent_ptrs(e))
- return "invalid: no pointers";
-
- if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX)
- return "invalid: too many pointers";
-
- extent_for_each_ptr(e, ptr) {
- struct cache_member *m = mi + ptr->dev;
- struct cache *ca;
-
- if (ptr->dev > c->sb.nr_in_set) {
- if (ptr->dev != PTR_LOST_DEV)
- return "pointer to invalid device";
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
- continue;
+ rcu_read_lock();
+ extent_for_each_entry(e, entry) {
+ if (!first)
+ p(" ");
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = crc_to_64((void *) entry);
+ p("crc: c_size %u size %u offset %u csum %u compress %u",
+ crc.compressed_size, crc.uncompressed_size,
+ crc.offset, crc.csum_type, crc.compression_type);
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ ptr = &entry->ptr;
+ p("ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
+ ? " stale" : "");
+ break;
}
- if (ptr->offset + e.k->size > m->bucket_size * m->nbuckets)
- return "invalid: offset past end of device";
-
- if (ptr->offset < m->bucket_size * m->first_bucket)
- return "invalid: offset before first bucket";
-
- if ((ptr->offset & (m->bucket_size - 1)) +
- e.k->size > m->bucket_size)
- return "invalid: spans multiple buckets";
-
- if ((ca = PTR_CACHE(c, ptr)) &&
- ptr_stale(ca, ptr))
- return "stale";
+ first = false;
}
+ rcu_read_unlock();
- if (!e.k->size)
- return "zeroed key";
- return "";
+ if (bkey_extent_is_cached(e.k))
+ p(" cached");
+#undef p
+ return out - buf;
}
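With the format strings above, a cached extent carrying one crc entry and one pointer renders roughly as follows (all numbers invented for illustration):

	crc: c_size 8 size 16 offset 0 csum 1 compress 1 ptr: 2:34816 gen 5 cached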
-static void bch_extent_to_text(struct cache_set *c, char *buf,
- size_t size, struct bkey_s_c k)
+/* Btree ptrs */
+
+static const char *bch_btree_ptr_invalid_reason(const struct cache_set *c,
+ struct bkey_s_c k)
{
- struct bkey_s_c_extent e;
- char *out = buf, *end = buf + size;
- const struct bch_extent_ptr *ptr;
+ if (bkey_extent_is_cached(k.k))
+ return "cached";
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+ if (k.k->size)
+ return "nonzero key size";
- if (bkey_extent_is_data(k.k)) {
- e = bkey_s_c_to_extent(k);
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
- extent_for_each_ptr(e, ptr) {
- if (ptr != e.v->ptr)
- p(", ");
+ switch (k.k->type) {
+ case BCH_EXTENT: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ const char *reason;
- p("%u:%llu gen %u", ptr->dev,
- (u64) ptr->offset, ptr->gen);
- }
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ reason = extent_ptr_invalid(mi, ptr,
+ CACHE_BTREE_NODE_SIZE(&c->sb));
- if (bkey_extent_is_cached(e.k))
- p(" cached");
-#if 0
- if (KEY_CSUM(k))
- p(" cs%llu %llx", KEY_CSUM(k), k->val[1]);
-#endif
+ if (reason) {
+ cache_member_info_put();
+ return reason;
+ }
+ }
- p(" %s", bch_ptr_status(c, cache_member_info_get(c)->m, e));
cache_member_info_put();
+
+ if (crc)
+ return "has crc field";
+
+ return NULL;
}
-#undef p
-}
-/* Btree ptrs */
+ default:
+ return "invalid value type";
+ }
+}
static bool bch_btree_ptr_invalid(const struct cache_set *c, struct bkey_s_c k)
{
- return bkey_extent_is_cached(k.k) ||
- k.k->size ||
- __ptr_invalid(c, k);
+ return bch_btree_ptr_invalid_reason(c, k);
}
static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
@@ -358,41 +398,49 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
char buf[160];
struct bucket *g;
struct cache *ca;
+ unsigned replicas = 0;
bool bad;
- if (bch_extent_ptrs(e) < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- cache_set_bug(c,
- "btree key bad (too few replicas, %u < %llu): %s",
- bch_extent_ptrs(e),
- CACHE_SET_META_REPLICAS_HAVE(&c->sb),
- buf);
+ if (bkey_extent_is_cached(k.k)) {
+ btree_bug(b, "btree ptr marked as cached");
return;
}
rcu_read_lock();
extent_for_each_online_device(c, e, ptr, ca) {
- g = PTR_BUCKET(ca, ptr);
+ replicas++;
- err = "stale";
- if (ptr_stale(ca, ptr))
- goto err;
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
- do {
- seq = read_seqcount_begin(&c->gc_cur_lock);
- bad = (!__gc_will_visit_node(c, b) &&
- !g->mark.is_metadata);
- } while (read_seqcount_retry(&c->gc_cur_lock, seq));
+ err = "stale";
+ if (ptr_stale(ca, ptr))
+ goto err;
- err = "inconsistent";
- if (bad)
- goto err;
+ do {
+ seq = read_seqcount_begin(&c->gc_cur_lock);
+ bad = (!__gc_will_visit_node(c, b) &&
+ !g->mark.is_metadata);
+ } while (read_seqcount_retry(&c->gc_cur_lock, seq));
+
+ err = "inconsistent";
+ if (bad)
+ goto err;
+ }
}
rcu_read_unlock();
+ if (replicas < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "btree key bad (too few replicas, %u < %llu): %s",
+ replicas, CACHE_SET_META_REPLICAS_HAVE(&c->sb), buf);
+ return;
+ }
+
return;
err:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
@@ -404,16 +452,43 @@ err:
rcu_read_unlock();
}
+static void bch_btree_ptr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_btree_ptr_invalid_reason(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
struct extent_pick_ptr
bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+ union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct cache *ca;
rcu_read_lock();
- extent_for_each_online_device(c, e, ptr, ca) {
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ if (crc) {
+ bch_cache_error(ca,
+ "btree node pointer with crc at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, btree_node_root(b)
+ ? btree_node_root(b)->level : -1,
+ PTR_BUCKET_NR(ca, ptr));
+ break;
+ }
+
if (ptr_stale(ca, ptr)) {
bch_cache_error(ca,
"stale btree node pointer at btree %u level %u/%u bucket %zu",
@@ -440,7 +515,7 @@ const struct btree_keys_ops bch_btree_interior_node_ops = {
const struct bkey_ops bch_bkey_btree_ops = {
.key_invalid = bch_btree_ptr_invalid,
.key_debugcheck = btree_ptr_debugcheck,
- .val_to_text = bch_extent_to_text,
+ .val_to_text = bch_btree_ptr_to_text,
};
/* Extents */
@@ -467,9 +542,24 @@ bool __bch_cut_front(struct bpos where, struct bkey_s k)
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- ptr->offset += e.k->size - len;
+ union bch_extent_crc *crc, *prev_crc = NULL;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ ptr->offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (prev_crc != crc)
+ crc->crc32.offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (prev_crc != crc)
+ crc->crc64.offset += e.k->size - len;
+ break;
+ }
+ prev_crc = crc;
+ }
}
k.k->size = len;
@@ -765,8 +855,8 @@ static void bch_drop_subtract(struct cache_set *c, struct btree *b,
static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
{
struct bkey_s_c_extent le, re;
+ const struct bch_extent_ptr *lp, *rp;
s64 offset;
- unsigned i;
BUG_ON(!l.k->size || !r.k->size);
@@ -803,12 +893,17 @@ static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
* pointer
*/
- if (bch_extent_ptrs(le) != bch_extent_ptrs(re))
+ if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
return false;
- for (i = 0; i < bch_extent_ptrs(le); i++) {
- const struct bch_extent_ptr *lp = le.v->ptr + i;
- const struct bch_extent_ptr *rp = re.v->ptr + i;
+ extent_for_each_ptr(le, lp) {
+ const union bch_extent_entry *entry =
+ bkey_idx(re.v, (u64 *) lp - le.v->_data);
+
+ if (!extent_entry_is_ptr(entry))
+ return false;
+
+ rp = &entry->ptr;
if (lp->offset != rp->offset + offset ||
lp->dev != rp->dev ||
@@ -1211,10 +1306,63 @@ out:
return inserted;
}
+static const char *bch_extent_invalid_reason(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+ return "value too big";
+
+ if (!k.k->size)
+ return "zero key size";
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc64 crc64;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned size_ondisk = e.k->size;
+ const char *reason;
+
+ extent_for_each_entry(e, entry)
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc64 = crc_to_64((void *) entry);
+
+ reason = "checksum uncompressed size < key size";
+ if (crc64.uncompressed_size < e.k->size)
+ goto invalid;
+
+ reason = "checksum offset > uncompressed size";
+ if (crc64.offset >= crc64.uncompressed_size)
+ goto invalid;
+
+ size_ondisk = crc64.compressed_size;
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ reason = extent_ptr_invalid(mi, &entry->ptr, size_ondisk);
+ if (reason)
+ goto invalid;
+ break;
+ }
+
+ cache_member_info_put();
+ return NULL;
+invalid:
+ cache_member_info_put();
+ return reason;
+ }
+
+ default:
+ return "invalid value type";
+ }
+}
+
static bool bch_extent_invalid(const struct cache_set *c, struct bkey_s_c k)
{
- return (bkey_extent_is_data(k.k) && !k.k->size) ||
- __ptr_invalid(c, k);
+ return bch_extent_invalid_reason(c, k);
}
static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
@@ -1229,27 +1377,17 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
char buf[160];
bool bad;
unsigned ptrs_per_tier[CACHE_TIERS];
- unsigned i, tier, replicas;
+ unsigned i, tier, replicas = 0;
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
- if (!bkey_extent_is_cached(e.k) &&
- bch_extent_ptrs(e) < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- cache_set_bug(c,
- "extent key bad (too few replicas, %u < %llu): %s",
- bch_extent_ptrs(e),
- CACHE_SET_DATA_REPLICAS_HAVE(&c->sb),
- buf);
- return;
- }
-
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr) {
bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+ replicas++;
+
/* Could be a special pointer such as PTR_CHECK_DEV */
if (ptr->dev >= mi->nr_in_set) {
if (ptr->dev != PTR_LOST_DEV)
@@ -1299,55 +1437,165 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
goto bad_ptr;
}
}
+ cache_member_info_put();
+
+ if (!bkey_extent_is_cached(e.k) &&
+ replicas < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "extent key bad (too few replicas, %u < %llu): %s",
+ replicas, CACHE_SET_DATA_REPLICAS_HAVE(&c->sb), buf);
+ return;
+ }
- replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
+ /*
+ * XXX: _why_ was this added?
+ */
for (i = 0; i < CACHE_TIERS; i++)
- if (ptrs_per_tier[i] > replicas) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
+ if (ptrs_per_tier[i] > CACHE_SET_DATA_REPLICAS_WANT(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
cache_set_bug(c,
"extent key bad (too many tier %u replicas): %s",
i, buf);
break;
}
- cache_member_info_put();
return;
bad_device:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- cache_set_bug(c, "extent pointer %u device missing: %s",
- (unsigned) (ptr - e.v->ptr), buf);
+ cache_set_bug(c, "extent pointer to dev %u missing device: %s",
+ ptr->dev, buf);
cache_member_info_put();
return;
bad_ptr:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- cache_set_bug(c, "extent pointer %u bad gc mark: %s:\nbucket %zu prio %i "
+ cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
"gen %i last_gc %i mark 0x%08x",
- (unsigned) (ptr - e.v->ptr), buf, PTR_BUCKET_NR(ca, ptr),
+ buf, PTR_BUCKET_NR(ca, ptr),
g->read_prio, PTR_BUCKET_GEN(ca, ptr),
g->oldest_gen, g->mark.counter);
cache_member_info_put();
return;
}
+static void bch_extent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_extent_invalid_reason(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
static unsigned PTR_TIER(struct cache_member_rcu *mi,
- const struct bch_extent *e,
- unsigned ptr)
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->dev < mi->nr_in_set
+ ? CACHE_TIER(&mi->m[ptr->dev])
+ : UINT_MAX;
+}
+
+static void __extent_sort_ptrs(struct cache_member_rcu *mi,
+ struct bkey_s_extent src)
+{
+ struct bch_extent_ptr *src_ptr, *dst_ptr;
+ union bch_extent_entry *src_crc, *dst_crc;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_extent dst;
+ size_t u64s, crc_u64s;
+ u64 *p;
+
+ /*
+ * Insertion sort:
+ *
+ * Note: this sort needs to be stable, because pointer order determines
+ * pointer dirtiness.
+ */
+
+ tmp.k.k = *src.k;
+ dst = bkey_i_to_s_extent(&tmp.k);
+ set_bkey_val_u64s(dst.k, 0);
+
+ extent_for_each_ptr_crc(src, src_ptr, src_crc) {
+ extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
+ if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
+ break;
+
+ /* found insert position: */
+
+ /*
+ * we're making sure everything has a crc at this point, if
+ * dst_ptr points to a pointer it better have a crc:
+ */
+ BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
+ BUG_ON(dst_crc && extent_entry_next(dst_crc) != (void *) dst_ptr);
+
+ p = dst_ptr != &extent_entry_last(dst)->ptr
+ ? (void *) dst_crc
+ : (void *) dst_ptr;
+
+ if (!src_crc)
+ src_crc = (void *) &((struct bch_extent_crc32) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc32,
+ .compressed_size = src.k->size,
+ .uncompressed_size = src.k->size,
+ .offset = 0,
+ .compression_type = BCH_COMPRESSION_NONE,
+ .csum_type = BCH_CSUM_NONE,
+ .csum = 0,
+ });
+
+ crc_u64s = extent_entry_u64s((void *) src_crc);
+ u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
+
+ memmove(p + u64s, p,
+ (void *) extent_entry_last(dst) - (void *) p);
+ set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
+
+ memcpy(p, src_crc, crc_u64s * sizeof(u64));
+ memcpy(p + crc_u64s, src_ptr, sizeof(*src_ptr));
+ }
+
+ /* Sort done - now drop redundant crc entries: */
+ extent_cleanup_crcs(dst);
+
+ memcpy(src.v, dst.v, bkey_val_bytes(dst.k));
+ set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
+}
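An illustrative before/after for the sort (not from the patch), showing that each pointer stays behind the crc entry that describes it:

/*
 *   before:  [crc X][ptr, tier 1][crc Y][ptr, tier 0]
 *   after:   [crc Y][ptr, tier 0][crc X][ptr, tier 1]
 *
 * A pointer that had no crc entry at all gets a synthesized "no checksum,
 * no compression" crc32 during the sort so it stays self-describing, and
 * extent_cleanup_crcs() afterwards drops whatever entries that left
 * redundant.
 */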
+
+static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
{
- unsigned dev = e->ptr[ptr].dev;
+ struct cache_member_rcu *mi;
+ struct bch_extent_ptr *ptr, *prev = NULL;
+ union bch_extent_crc *crc;
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (prev &&
+ PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
+ __extent_sort_ptrs(mi, e);
+ break;
+ }
- return dev < mi->nr_in_set ? CACHE_TIER(&mi->m[dev]) : UINT_MAX;
+ cache_member_info_put();
}
bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
{
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct cache_member_rcu *mi;
- unsigned i;
- bool swapped, have_data = false;
+ bool have_data = false;
switch (k.k->type) {
case KEY_TYPE_ERROR:
@@ -1364,31 +1612,15 @@ bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
case BCH_EXTENT_CACHED:
e = bkey_s_to_extent(k);
-
- bch_extent_drop_stale(c, k);
-
- mi = cache_member_info_get(c);
-
- /* Bubble sort pointers by tier, lowest (fastest) tier first */
- do {
- swapped = false;
- for (i = 0; i + 1 < bch_extent_ptrs(e); i++) {
- if (PTR_TIER(mi, e.v, i) >
- PTR_TIER(mi, e.v, i + 1)) {
- swap(e.v->ptr[i], e.v->ptr[i + 1]);
- swapped = true;
- }
- }
- } while (swapped);
-
- cache_member_info_put();
+ bch_extent_drop_stale(c, e);
+ extent_sort_ptrs(c, e);
extent_for_each_ptr(e, ptr)
if (ptr->dev != PTR_LOST_DEV)
have_data = true;
if (!have_data) {
- bch_set_extent_ptrs(e, 0);
+ set_bkey_val_u64s(e.k, 0);
if (bkey_extent_is_cached(e.k)) {
k.k->type = KEY_TYPE_DISCARD;
if (!k.k->version)
@@ -1417,6 +1649,7 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
struct cache *avoid)
{
struct bkey_s_c_extent e;
+ const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct cache *ca;
struct extent_pick_ptr ret = { .ca = NULL };
@@ -1439,9 +1672,10 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
e = bkey_s_c_to_extent(k);
rcu_read_lock();
- extent_for_each_online_device(c, e, ptr, ca)
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca)
if (!ptr_stale(ca, ptr)) {
ret = (struct extent_pick_ptr) {
+ .crc = crc_to_64(crc),
.ptr = *ptr,
.ca = ca,
};
@@ -1469,7 +1703,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
struct btree *b = container_of(bk, struct btree, keys);
struct cache_set *c = b->c;
struct bkey_s_extent el, er;
- unsigned i;
+ union bch_extent_entry *en_l, *en_r;
if (key_merging_disabled(c))
return BCH_MERGE_NOMERGE;
@@ -1498,11 +1732,20 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
el = bkey_i_to_s_extent(l);
er = bkey_i_to_s_extent(r);
- for (i = 0; i < bch_extent_ptrs(el); i++) {
- struct bch_extent_ptr *lp = el.v->ptr + i;
- struct bch_extent_ptr *rp = er.v->ptr + i;
+ extent_for_each_entry(el, en_l) {
+ struct bch_extent_ptr *lp, *rp;
struct cache_member *m;
+ en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+
+ if ((extent_entry_type(en_l) !=
+ extent_entry_type(en_r)) ||
+ extent_entry_is_crc(en_l))
+ return BCH_MERGE_NOMERGE;
+
+ lp = &en_l->ptr;
+ rp = &en_r->ptr;
+
if (lp->offset + el.k->size != rp->offset ||
lp->dev != rp->dev ||
lp->gen != rp->gen)
@@ -1533,14 +1776,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
bch_cut_front(l->k.p, r);
return BCH_MERGE_PARTIAL;
}
-#if 0
- if (KEY_CSUM(l)) {
- if (KEY_CSUM(r))
- l->val[bch_extent_ptrs(l)] = merge_chksums(l, r);
- else
- SET_KEY_CSUM(l, 0);
- }
-#endif
+
bch_key_resize(&l->k, l->k.size + r->k.size);
return BCH_MERGE_MERGE;
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
index ad6bcdf185ad..08c039bd0869 100644
--- a/drivers/md/bcache/extents.h
+++ b/drivers/md/bcache/extents.h
@@ -3,7 +3,10 @@
#include "bkey.h"
+#include <linux/bcache.h>
+
struct bch_replace_info;
+union bch_extent_crc;
struct btree_nr_keys bch_key_sort_fix_overlapping(struct btree_keys *,
struct bset *,
@@ -31,6 +34,7 @@ struct cache_set;
struct journal_res;
struct extent_pick_ptr {
+ struct bch_extent_crc64 crc;
struct bch_extent_ptr ptr;
struct cache *ca;
};
@@ -53,7 +57,7 @@ bool bch_insert_fixup_extent(struct cache_set *, struct btree *,
struct bch_replace_info *, struct bpos *,
struct journal_res *, unsigned);
-void bch_extent_drop_stale(struct cache_set *c, struct bkey_s);
+void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent);
bool bch_extent_normalize(struct cache_set *, struct bkey_s);
static inline bool bkey_extent_is_data(const struct bkey *k)
@@ -80,69 +84,269 @@ static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
}
-#define bch_extent_ptrs(_e) bkey_val_u64s((_e).k)
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-static inline void bch_set_extent_ptrs(struct bkey_s_extent e, unsigned i)
+ return ret;
+}
+
+static inline size_t __extent_entry_bytes(enum bch_extent_entry_type type)
{
- BUG_ON(i > BKEY_EXTENT_PTRS_MAX);
- set_bkey_val_u64s(e.k, i);
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return sizeof(struct bch_extent_crc32);
+ case BCH_EXTENT_ENTRY_crc64:
+ return sizeof(struct bch_extent_crc64);
+ case BCH_EXTENT_ENTRY_ptr:
+ return sizeof(struct bch_extent_ptr);
+ default:
+ BUG();
+ }
}
-static inline void bch_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
+static inline size_t __extent_entry_u64s(enum bch_extent_entry_type type)
{
- BUG_ON(ptr < e.v->ptr ||
- ptr >= e.v->ptr + bch_extent_ptrs(e.c));
+ return __extent_entry_bytes(type) / sizeof(u64);
+}
- memmove(ptr, ptr + 1,
- (void *) (e.v->ptr + bch_extent_ptrs(e.c)) -
- (void *) (ptr + 1));
- e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+static inline size_t extent_entry_bytes(const union bch_extent_entry *e)
+{
+ return __extent_entry_bytes(extent_entry_type(e));
}
-static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
+static inline size_t extent_entry_u64s(const union bch_extent_entry *e)
{
- /* Dirty pointers come last */
+ return extent_entry_bytes(e) / sizeof(u64);
+}
- if (bkey_extent_is_cached(e.k))
- return false;
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
- return ptr + CACHE_SET_DATA_REPLICAS_WANT(&c->sb) >=
- e.v->ptr + bch_extent_ptrs(e);
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ return !extent_entry_is_ptr(e);
}
-#define extent_for_each_ptr(_extent, _ptr) \
- for ((_ptr) = (_extent).v->ptr; \
- (_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent); \
- (_ptr)++)
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+};
-/*
- * Use this when you'll be dropping pointers as you iterate.
- * Any reason we shouldn't just always do this?
- */
-#define extent_for_each_ptr_backwards(_extent, _ptr) \
- for ((_ptr) = (_extent).v->ptr + bch_extent_ptrs(_extent) - 1; \
- (_ptr) >= (_extent).v->ptr; \
- --(_ptr))
+enum bch_extent_crc_type {
+ BCH_EXTENT_CRC_NONE,
+ BCH_EXTENT_CRC32,
+ BCH_EXTENT_CRC64,
+};
+
+static inline enum bch_extent_crc_type
+bch_extent_crc_type(const union bch_extent_crc *crc)
+{
+ if (!crc)
+ return BCH_EXTENT_CRC_NONE;
+
+ switch (extent_entry_type((void *) crc)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return BCH_EXTENT_CRC32;
+ case BCH_EXTENT_ENTRY_crc64:
+ return BCH_EXTENT_CRC64;
+ default:
+ BUG();
+ }
+}
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+#define extent_entry_last(_e) \
+ bkey_idx((_e).v, bkey_val_u64s((_e).k))
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ for ((_entry) = _start; \
+ (_entry) < extent_entry_last(_e); \
+ (_entry) = extent_entry_next(_entry))
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* Iterates through entries until it hits a pointer: */
+#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
+({ \
+ __label__ out; \
+ const union bch_extent_entry *_entry; \
+ \
+ extent_for_each_entry_from(_e, _entry, (void *) _ptr) \
+ if (extent_entry_is_crc(_entry)) { \
+ (_crc) = (void *) _entry; \
+ } else { \
+ _ptr = (typeof(_ptr)) &_entry->ptr; \
+ if (_filter) \
+ goto out; \
+ } \
+ \
+ _ptr = NULL; \
+out: \
+ _ptr; \
+})
-#define __extent_ptr_next_online_device(_c, _extent, _ptr, _ca) \
+#define extent_ptr_next_filter(_e, _ptr, _filter) \
({ \
- (_ca) = NULL; \
+ union bch_extent_crc *_crc; \
\
- while ((_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent) && \
- !((_ca) = PTR_CACHE(_c, _ptr))) \
- (_ptr)++; \
- (_ca); \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
})
-#define extent_for_each_online_device(_c, _extent, _ptr, _ca) \
- for ((_ptr) = (_extent).v->ptr; \
- ((_ca) = __extent_ptr_next_online_device(_c, _extent, \
- _ptr, _ca)); \
+#define extent_ptr_crc_next(_e, _crc, _ptr) \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, true)
+
+#define extent_ptr_next(_e, _ptr) \
+ extent_ptr_next_filter(_e, _ptr, true)
+
+#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
+ for ((_crc) = NULL, \
+ (_ptr) = &(_e).v->start->ptr; \
+ ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
(_ptr)++)
+#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
+ (_ptr)++)
+
+#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, _filter)
+
+#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
+
+#define extent_for_each_ptr_from(_e, _ptr, _start) \
+ extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
+
+#define extent_for_each_ptr(_e, _ptr) \
+ extent_for_each_ptr_filter(_e, _ptr, true)
+
+#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
+ extent_for_each_ptr_filter(_e, _ptr, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_ptr_prev(_e, _ptr) \
+({ \
+ typeof(&(_e).v->start->ptr) _p; \
+ typeof(&(_e).v->start->ptr) _prev = NULL; \
+ \
+ extent_for_each_ptr(_e, _p) { \
+ if (_p == (_ptr)) \
+ break; \
+ _prev = _p; \
+ } \
+ \
+ _prev; \
+})
+
+/*
+ * Use this when you'll be dropping pointers as you iterate. Quadratic,
+ * unfortunately:
+ */
+#define extent_for_each_ptr_backwards(_e, _ptr) \
+ for ((_ptr) = extent_ptr_prev(_e, NULL); \
+ (_ptr); \
+ (_ptr) = extent_ptr_prev(_e, _ptr))
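A hypothetical helper (not in the patch) showing how the crc-aware iterator is meant to be used -- each pointer is visited together with the crc entry currently governing it, which may be NULL (crc_to_64() is defined a little further down):

static unsigned nr_checksummed_ptrs(struct bkey_s_c_extent e)
{
	const union bch_extent_crc *crc;
	const struct bch_extent_ptr *ptr;
	unsigned nr = 0;

	extent_for_each_ptr_crc(e, ptr, crc)
		if (crc && crc_to_64(crc).csum_type != BCH_CSUM_NONE)
			nr++;

	return nr;
}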
+
+/*
+ * make sure the type field gets set correctly:
+ */
+#define __extent_entry_append(_e, _type, _val) \
+do { \
+ union bch_extent_entry *_new = \
+ extent_entry_last(extent_i_to_s((_e))); \
+ \
+ (_e)->k.u64s += __extent_entry_u64s(BCH_EXTENT_ENTRY_##_type); \
+ BUG_ON(bkey_val_u64s(&(_e)->k) > BKEY_EXTENT_VAL_U64s_MAX); \
+ \
+ _new->_type = _val; \
+ _new->_type.type = 1 << BCH_EXTENT_ENTRY_##_type; \
+ \
+ BUG_ON(extent_entry_type(_new) != BCH_EXTENT_ENTRY_##_type); \
+} while (0)
+
+static inline void extent_crc32_append(struct bkey_i_extent *e,
+ struct bch_extent_crc32 crc)
+{
+ __extent_entry_append(e, crc32, crc);
+}
+
+static inline void extent_crc64_append(struct bkey_i_extent *e,
+ struct bch_extent_crc64 crc)
+{
+ __extent_entry_append(e, crc64, crc);
+}
+
+static inline void extent_ptr_append(struct bkey_i_extent *e,
+ struct bch_extent_ptr ptr)
+{
+ __extent_entry_append(e, ptr, ptr);
+}
+
+/* XXX: inefficient */
+static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ const struct bch_extent_ptr *i;
+ unsigned seen = 0;
+
+ if (bkey_extent_is_cached(e.k))
+ return false;
+
+ /* Dirty pointers come last */
+ extent_for_each_ptr_from(e, i, ptr)
+ seen++;
+
+ return seen <= CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
+}
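A worked example of the "dirty pointers come last" rule (values invented): with CACHE_SET_DATA_REPLICAS_WANT == 2 and an uncached extent whose pointers are [p0, p1, p2], the walk from p2 sees 1 pointer and the walk from p1 sees 2, so p1 and p2 are dirty; the walk from p0 sees 3 > 2, so p0 is a clean extra copy.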
+
+static inline struct bch_extent_crc64 crc_to_64(const union bch_extent_crc *crc)
+{
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return (struct bch_extent_crc64) { 0 };
+ case BCH_EXTENT_CRC32:
+ return (struct bch_extent_crc64) {
+ .compressed_size = crc->crc32.compressed_size,
+ .uncompressed_size = crc->crc32.uncompressed_size,
+ .offset = crc->crc32.offset,
+ .csum_type = crc->crc32.csum_type,
+ .compression_type = crc->crc32.compression_type,
+ .csum = crc->crc32.csum,
+ };
+ case BCH_EXTENT_CRC64:
+ return crc->crc64;
+ default:
+ BUG();
+ }
+}
+
+void extent_adjust_pointers(struct bkey_s_extent, union bch_extent_entry *);
+
+/* Doesn't cleanup redundant crcs */
+static inline void __bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ memmove(ptr, ptr + 1, (void *) extent_entry_last(e) - (void *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
bool bch_extent_has_device(struct bkey_s_c_extent, unsigned);
bool bch_cut_front(struct bpos, struct bkey_i *);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index d543344be4ab..c0d17ad94623 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -21,6 +21,7 @@
#include "super.h"
#include <linux/blkdev.h>
+#include <linux/zlib.h>
#include <trace/events/bcache.h>
@@ -61,29 +62,68 @@ void bch_bio_submit_work(struct work_struct *work)
}
}
-/* Bios with headers */
+/* Allocate, free from mempool: */
-void bch_bbio_prep(struct bbio *b, struct cache *ca)
+void bch_bio_free_pages_pool(struct cache_set *c, struct bio *bio)
{
- struct bvec_iter *iter = &b->bio.bi_iter;
+ struct bio_vec *bv;
+ unsigned i;
+
+ bio_for_each_segment_all(bv, bio, i)
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+}
+
+static void bch_bio_alloc_page_pool(struct cache_set *c, struct bio *bio,
+ bool *using_mempool)
+{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+
+ if (likely(!*using_mempool)) {
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bv->bv_page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+}
+
+static void bch_bio_alloc_pages_pool(struct cache_set *c, struct bio *bio,
+ size_t bytes)
+{
+ bool using_mempool = false;
+
+ bio->bi_iter.bi_size = bytes;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
+ bch_bio_alloc_page_pool(c, bio, &using_mempool);
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
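A note on the design (inferred from the code above, not stated in the patch): the mempool is only drawn from while bio_bounce_pages_lock is held, so at most one bio uses the emergency reserve at a time and the pool's minimum size is enough for forward progress when alloc_page(GFP_NOIO) fails; pages obtained here go back via bch_bio_free_pages_pool(), e.g. from the write endio path when the bio was bounced.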
+
+/* Bios with headers */
+static void bch_bbio_prep(struct bbio *b, struct cache *ca)
+{
b->ca = ca;
b->bio.bi_iter.bi_sector = b->ptr.offset;
b->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
-
- b->bi_idx = iter->bi_idx;
- b->bi_bvec_done = iter->bi_bvec_done;
}
-/* XXX: should be bkey, not bkey_i */
-void bch_submit_bbio(struct bbio *b, struct cache *ca, const struct bkey_i *k,
+void bch_submit_bbio(struct bbio *b, struct cache *ca,
const struct bch_extent_ptr *ptr, bool punt)
{
struct bio *bio = &b->bio;
- b->key = *k;
b->ptr = *ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&b->key), 1);
bch_bbio_prep(b, ca);
b->submit_time_us = local_clock_us();
@@ -100,27 +140,28 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c,
bool punt)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
struct cache *ca;
- unsigned ptr;
+ unsigned ptr_idx = 0;
BUG_ON(bio->orig);
- for (ptr = ptrs_from;
- ptr < bch_extent_ptrs(e);
- ptr++) {
+ extent_for_each_ptr(e, ptr) {
+ if (ptr_idx++ < ptrs_from)
+ continue;
+
rcu_read_lock();
- ca = PTR_CACHE(c, &e.v->ptr[ptr]);
+ ca = PTR_CACHE(c, ptr);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
if (!ca) {
- bch_submit_bbio(&bio->bio, ca, k,
- &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&bio->bio, ca, ptr, punt);
break;
}
- if (ptr + 1 < bch_extent_ptrs(e)) {
+ if (ptr + 1 < &extent_entry_last(e)->ptr) {
struct bch_write_bio *n =
to_wbio(bio_clone_fast(&bio->bio.bio, GFP_NOIO,
&ca->replica_set));
@@ -130,16 +171,17 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c,
n->orig = &bio->bio.bio;
__bio_inc_remaining(n->orig);
- bch_submit_bbio(&n->bio, ca, k, &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&n->bio, ca, ptr, punt);
} else {
- bch_submit_bbio(&bio->bio, ca, k,
- &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&bio->bio, ca, ptr, punt);
}
}
}
static void bch_bbio_reset(struct bbio *b)
{
+ BUG();
+#if 0
struct bvec_iter *iter = &b->bio.bi_iter;
bio_reset(&b->bio);
@@ -147,6 +189,7 @@ static void bch_bbio_reset(struct bbio *b)
iter->bi_size = b->key.k.size << 9;
iter->bi_idx = b->bi_idx;
iter->bi_bvec_done = b->bi_bvec_done;
+#endif
}
/* IO errors */
@@ -268,25 +311,198 @@ static inline bool version_stress_test(struct cache_set *c)
#endif
}
-static void __bch_write(struct closure *);
-
-#if 0
-static void bio_csum(struct bio *bio, struct bkey *k)
+static u32 checksum_bio(struct bio *bio, unsigned type)
{
struct bio_vec bv;
struct bvec_iter iter;
- u64 crc = 0xffffffffffffffffULL;
+ u32 csum = U32_MAX;
+
+ if (type == BCH_CSUM_NONE)
+ return 0;
bio_for_each_segment(bv, bio, iter) {
- void *d = kmap(bv.bv_page) + bv.bv_offset;
+ void *p = kmap_atomic(bv.bv_page);
- crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
- kunmap(bv.bv_page);
+ csum = bch_checksum_update(type, csum,
+ p + bv.bv_offset,
+ bv.bv_len);
+ kunmap_atomic(p);
}
- k->val[bch_extent_ptrs(k)] = crc;
+ return csum ^= U32_MAX;
}
-#endif
+
+static int bio_compress_gzip(struct cache_set *c, struct bio *dst,
+ struct bio *src, unsigned output_available)
+{
+ struct bvec_iter src_iter = src->bi_iter;
+ z_stream strm;
+ struct page *workspace;
+ struct page *inp = NULL;
+ void *k_in = NULL;
+ bool using_mempool = false;
+ int ret;
+
+ BUG_ON(dst->bi_iter.bi_size);
+
+ workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO);
+ strm.workspace = page_address(workspace);
+
+ zlib_deflateInit(&strm, 3);
+ strm.next_in = NULL;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+ strm.avail_in = 0;
+
+ while (1) {
+ if (!strm.avail_out) {
+ struct bio_vec *bv = &dst->bi_io_vec[dst->bi_vcnt];
+
+ if (!output_available) {
+ /*
+ * XXX: this really shouldn't happen, accounting
+ * is screwed up somehow:
+ */
+ //pr_err("output_available == 0");
+ goto err;
+ }
+
+ BUG_ON(dst->bi_vcnt >= dst->bi_max_vecs);
+
+ if (k_in) {
+ kunmap_atomic(k_in);
+
+ bch_bio_alloc_page_pool(c, dst, &using_mempool);
+
+ strm.next_in = kmap_atomic(inp) +
+ (((unsigned long) strm.next_in) &
+ (PAGE_SIZE - 1));
+ } else {
+ bch_bio_alloc_page_pool(c, dst, &using_mempool);
+ }
+
+ strm.next_out = page_address(bv->bv_page);
+ strm.avail_out = min_t(unsigned, PAGE_SIZE,
+ output_available);
+
+ dst->bi_iter.bi_size += strm.avail_out;
+ output_available -= strm.avail_out;
+ }
+
+ if (!strm.avail_in && src_iter.bi_size &&
+ output_available > PAGE_SIZE * 3 / 2) {
+ struct bio_vec bv = bio_iter_iovec(src, src_iter);
+
+ if (k_in)
+ kunmap_atomic(k_in);
+
+ strm.avail_in = bv.bv_len;
+ inp = bv.bv_page;
+ k_in = kmap_atomic(inp);
+ strm.next_in = k_in + bv.bv_offset;
+
+ bio_advance_iter(src, &src_iter, strm.avail_in);
+ }
+
+ ret = zlib_deflate(&strm, strm.avail_in
+ ? Z_NO_FLUSH : Z_FINISH);
+ if (ret == Z_STREAM_END)
+ break;
+
+ BUG_ON(ret != Z_OK);
+ }
+
+ ret = zlib_deflateEnd(&strm);
+ BUG_ON(ret != Z_OK);
+
+ BUG_ON(strm.total_out > dst->bi_iter.bi_size);
+
+ /* caller will pad with 0s to block boundary */
+ dst->bi_iter.bi_size = strm.total_out;
+
+ /* return number of bytes consumed */
+ ret = src->bi_iter.bi_size - src_iter.bi_size;
+out:
+ if (k_in)
+ kunmap_atomic(k_in);
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+ mempool_free(workspace, &c->compression_workspace_pool);
+
+ return ret;
+err:
+ ret = -1;
+ goto out;
+}
+
+static unsigned bio_compress(struct cache_set *c, struct bio *dst,
+ struct bio *src, unsigned *compression_type,
+ unsigned output_available)
+{
+ int ret = 0;
+
+ /* if it's only one block, don't bother trying to compress: */
+ if (bio_sectors(src) <= c->sb.block_size)
+ *compression_type = BCH_COMPRESSION_NONE;
+
+ switch (*compression_type) {
+ case BCH_COMPRESSION_NONE:
+ /* Just bounce it, for stable checksums: */
+copy:
+ bch_bio_alloc_pages_pool(c, dst, output_available);
+ bio_copy_data(dst, src);
+ return output_available;
+ case BCH_COMPRESSION_LZO1X:
+ BUG();
+ case BCH_COMPRESSION_GZIP:
+ ret = bio_compress_gzip(c, dst, src, output_available);
+ break;
+ case BCH_COMPRESSION_XZ:
+ BUG();
+ default:
+ BUG();
+ }
+
+ if (ret < 0) {
+ /* Failed to compress (didn't get smaller): */
+ *compression_type = BCH_COMPRESSION_NONE;
+ goto copy;
+ }
+
+ BUG_ON(ret & ((1 << (c->block_bits + 9)) - 1));
+
+ if (DIV_ROUND_UP(dst->bi_iter.bi_size, block_bytes(c)) >=
+ ret >> (c->block_bits + 9)) {
+ /* Failed to compress (didn't get smaller): */
+ *compression_type = BCH_COMPRESSION_NONE;
+ goto copy;
+ }
+
+ /* Pad to blocksize, and zero out padding: */
+ while (dst->bi_iter.bi_size & (block_bytes(c) - 1)) {
+ unsigned idx = dst->bi_iter.bi_size >> PAGE_SHIFT;
+ unsigned offset = dst->bi_iter.bi_size & (PAGE_SIZE - 1);
+ unsigned bytes = (PAGE_SIZE - offset) & (block_bytes(c) - 1);
+
+ if (idx < dst->bi_vcnt) {
+ struct bio_vec *bv = &dst->bi_io_vec[idx];
+
+ memset(page_address(bv->bv_page) + offset, 0, bytes);
+ } else {
+ dst->bi_io_vec[dst->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = ZERO_PAGE(0),
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0,
+ };
+ }
+
+ dst->bi_iter.bi_size += bytes;
+ }
+
+ return ret;
+}
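A standalone model of the "did compression actually win a block?" test above, assuming 4 KiB blocks (block size and numbers invented for illustration):

#include <stdio.h>

#define BLOCK_BYTES		4096u
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned consumed = 8 * BLOCK_BYTES;	/* input bytes fed to the compressor */
	unsigned out	  = 22100;		/* compressed output bytes */

	unsigned in_blocks  = consumed / BLOCK_BYTES;
	unsigned out_blocks = DIV_ROUND_UP(out, BLOCK_BYTES);

	if (out_blocks >= in_blocks)
		printf("no win: fall back to the bounce-and-copy path\n");
	else
		printf("keep compressed data, pad %u -> %u bytes with zeroes\n",
		       out, out_blocks * BLOCK_BYTES);
	return 0;
}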
+
+static void __bch_write(struct closure *);
static void bch_write_done(struct closure *cl)
{
@@ -302,6 +518,11 @@ static void bch_write_done(struct closure *cl)
if (!op->write_done)
continue_at(cl, __bch_write, op->io_wq);
+ if (op->replace_collision) {
+ trace_bcache_promote_collision(&op->replace_info.key.k);
+ atomic_inc(&op->c->accounting.collector.cache_miss_collisions);
+ }
+
percpu_ref_put(&op->c->writes);
bch_keylist_free(&op->insert_keys);
closure_return(cl);
@@ -381,6 +602,7 @@ static void bch_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
if (bio->bi_error) {
/* TODO: We could try to recover from this. */
@@ -393,14 +615,277 @@ static void bch_write_endio(struct bio *bio)
set_closure_fn(cl, NULL, NULL);
}
- bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
+ if (wbio->orig)
+ bio_endio(wbio->orig);
+ else if (wbio->bounce)
+ bch_bio_free_pages_pool(op->c, bio);
+
+ bch_bbio_endio(&wbio->bio, bio->bi_error, "writing data to cache");
+}
+
+static const unsigned bch_crc_size[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64] = 8,
+};
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ *
+ * XXX: to guard against data being corrupted while in memory, instead of
+ * recomputing the checksum here, it would be better to have the read path,
+ * instead of computing the checksum of the entire extent:
+ *
+ * | extent |
+ *
+ * compute the checksums of the live and dead data separately
+ * | dead data || live data || dead data |
+ *
+ * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
+ * use crc_live here (that we verified was correct earlier)
+ */
+static void extent_cleanup_checksums(struct bkey_s_extent e,
+ u64 csum, unsigned csum_type)
+{
+ union bch_extent_entry *entry;
+
+ extent_for_each_entry(e, entry)
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ continue;
+ case BCH_EXTENT_ENTRY_crc32:
+ if (entry->crc32.compression_type != BCH_COMPRESSION_NONE ||
+ bch_crc_size[csum_type] > sizeof(entry->crc32.csum))
+ continue;
+
+ extent_adjust_pointers(e, entry);
+ entry->crc32.compressed_size = e.k->size;
+ entry->crc32.uncompressed_size = e.k->size;
+ entry->crc32.offset = 0;
+ entry->crc32.csum_type = csum_type;
+ entry->crc32.csum = csum;
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ if (entry->crc64.compression_type != BCH_COMPRESSION_NONE ||
+ bch_crc_size[csum_type] > sizeof(entry->crc64.csum))
+ continue;
+
+ extent_adjust_pointers(e, entry);
+ entry->crc64.compressed_size = e.k->size;
+ entry->crc64.uncompressed_size = e.k->size;
+ entry->crc64.offset = 0;
+ entry->crc64.csum_type = csum_type;
+ entry->crc64.csum = csum;
+ break;
+ }
+}
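
The XXX note above extent_cleanup_checksums() describes verifying the live
region's checksum by checksumming the dead and live regions separately and
checking them against the original whole-extent checksum. A minimal sketch of
that idea, assuming a flat buffer and the kernel's crc32c() (a running CRC
continued from a seed, so the '+' in the note becomes CRC continuation rather
than literal addition), and assuming the stored checksum was seeded with 0; the
helper below is illustrative only and not part of this patch:

#include <linux/types.h>
#include <linux/crc32c.h>

/*
 * Hypothetical helper: @buf holds the entire originally written extent,
 * laid out as [ dead1 | live | dead2 ].  Returns true, and the checksum of
 * just the live region, only if the whole-extent checksum still matches.
 */
static bool crc32c_verify_live(const u8 *buf, size_t dead1, size_t live,
			       size_t dead2, u32 orig_csum, u32 *live_csum)
{
	u32 crc_live = crc32c(0, buf + dead1, live);
	u32 crc      = crc32c(0, buf, dead1);

	/* continue the whole-extent CRC across the live and trailing regions */
	crc = crc32c(crc, buf + dead1, live);
	crc = crc32c(crc, buf + dead1 + live, dead2);

	if (crc != orig_csum)
		return false;	/* extent was corrupted; trust nothing */

	*live_csum = crc_live;	/* covers only the currently live data */
	return true;
}
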
+
+static void extent_checksum_append(struct bkey_i_extent *e,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ struct bch_extent_ptr *ptr;
+ union bch_extent_crc *crc;
+
+ BUG_ON(compressed_size > uncompressed_size);
+ BUG_ON(uncompressed_size != e->k.size);
+
+ /*
+ * Look up the last crc entry, so we can check if we need to add
+ * another:
+ */
+ extent_for_each_ptr_crc(extent_i_to_s(e), ptr, crc)
+ ;
+
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ if (csum_type == BCH_CSUM_NONE &&
+ compression_type == BCH_COMPRESSION_NONE)
+ return;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (crc->crc32.compressed_size == compressed_size &&
+ crc->crc32.uncompressed_size == uncompressed_size &&
+ crc->crc32.offset == 0 &&
+ crc->crc32.compression_type == compression_type &&
+ crc->crc32.csum_type == csum_type &&
+ crc->crc32.csum == csum)
+ return;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (crc->crc64.compressed_size == compressed_size &&
+ crc->crc64.uncompressed_size == uncompressed_size &&
+ crc->crc64.offset == 0 &&
+		    crc->crc64.compression_type == compression_type &&
+ crc->crc64.csum_type == csum_type &&
+ crc->crc64.csum == csum)
+ return;
+ break;
+ }
+
+ switch (csum_type) {
+ case BCH_CSUM_NONE:
+ case BCH_CSUM_CRC32C:
+ BUG_ON(compressed_size > CRC32_EXTENT_SIZE_MAX ||
+ uncompressed_size > CRC32_EXTENT_SIZE_MAX);
+
+ extent_crc32_append(e, (struct bch_extent_crc32) {
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ });
+ break;
+ case BCH_CSUM_CRC64:
+ BUG_ON(compressed_size > CRC64_EXTENT_SIZE_MAX ||
+ uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+
+ extent_crc64_append(e, (struct bch_extent_crc64) {
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ });
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch_write_extent(struct bch_write_op *op,
+ struct open_bucket *ob,
+ struct bkey_i *k, struct bio *orig)
+{
+ struct cache_set *c = op->c;
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ struct bkey_i_extent *e = bkey_i_to_extent(k);
+ struct bch_extent_ptr *ptr;
+ unsigned ptrs_from = 0;
+ unsigned csum_type = CACHE_DATA_PREFERRED_CSUM_TYPE(&c->sb);
+ unsigned compression_type = CACHE_COMPRESSION_TYPE(&c->sb);
+
+ /* don't refetch csum type/compression type */
+ barrier();
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptrs_from++;
+
+ if (csum_type != BCH_CSUM_NONE ||
+ compression_type != BCH_COMPRESSION_NONE) {
+ /* all units here in bytes */
+ unsigned output_available, input_available, input_consumed;
+ u64 csum;
+
+ BUG_ON(bio_sectors(orig) != k->k.size);
+
+ /* XXX: decide extent size better: */
+ output_available = min(k->k.size,
+ min(ob->sectors_free,
+ CRC32_EXTENT_SIZE_MAX)) << 9;
+
+ input_available = min(orig->bi_iter.bi_size,
+ CRC32_EXTENT_SIZE_MAX << 9);
+
+ /*
+ * temporarily set input bio's size to the max we want to
+ * consume from it, in order to avoid overflow in the crc info
+ */
+ swap(orig->bi_iter.bi_size, input_available);
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(output_available, PAGE_SIZE),
+ &c->bio_write);
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = true;
+
+ input_consumed = bio_compress(c, bio, orig,
+ &compression_type,
+ output_available);
+
+ swap(orig->bi_iter.bi_size, input_available);
+
+ bch_key_resize(&k->k, input_consumed >> 9);
+ bio_advance(orig, input_consumed);
+
+ /*
+ * XXX: could move checksumming out from under the open
+ * bucket lock - but compression is also being done
+ * under it
+ */
+ csum = checksum_bio(bio, csum_type);
+
+ /*
+ * If possible, adjust existing pointers to only point to
+ * currently live data, while we have the checksum for that
+ * data:
+ */
+ extent_cleanup_checksums(extent_i_to_s(e), csum, csum_type);
+
+ /*
+ * Add a bch_extent_crc header for the pointers that
+ * bch_alloc_sectors_done() is going to append:
+ */
+ extent_checksum_append(e, bio_sectors(bio), e->k.size,
+ compression_type,
+ csum, csum_type);
+
+ bch_alloc_sectors_done(op->c, op->wp, k, ob, bio_sectors(bio));
+ } else {
+ if (k->k.size > ob->sectors_free)
+ bch_key_resize(&k->k, ob->sectors_free);
+
+ /*
+ * We might need a checksum entry, if there's a previous
+ * checksum entry we need to override:
+ */
+ extent_checksum_append(e, k->k.size, k->k.size,
+ compression_type, 0, csum_type);
+
+ bch_alloc_sectors_done(op->c, op->wp, k, ob, k->k.size);
+
+ bio = bio_next_split(orig, k->k.size, GFP_NOIO,
+ &op->c->bio_write);
+ if (bio == orig)
+ bio_get(bio);
+
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ }
+
+ bio->bi_end_io = bch_write_endio;
+ bio->bi_private = &op->cl;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+#ifndef CONFIG_BCACHE_NO_IO
+ bch_submit_bbio_replicas(wbio, op->c, k, ptrs_from, false);
+#else
+ ptrs_from = ptrs_from;
+ bch_bbio_prep(&wbio->bio, NULL);
+ closure_get(bio->bi_private);
+ bio_endio(bio);
+#endif
}
static void __bch_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio.bio, *n;
- unsigned open_bucket_nr = 0, ptrs_from;
+ struct bio *bio = &op->bio->bio.bio;
+ unsigned open_bucket_nr = 0;
struct open_bucket *b;
memset(op->open_buckets, 0, sizeof(op->open_buckets));
@@ -412,8 +897,9 @@ static void __bch_write(struct closure *cl)
continue_at(cl, bch_write_done, op->c->wq);
}
- bch_extent_drop_stale(op->c, bkey_i_to_s(&op->insert_key));
- ptrs_from = bch_extent_ptrs(bkey_i_to_s_extent(&op->insert_key));
+ if (bkey_extent_is_data(&op->insert_key.k))
+ bch_extent_drop_stale(op->c,
+ bkey_i_to_s_extent(&op->insert_key));
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
@@ -438,9 +924,9 @@ static void __bch_write(struct closure *cl)
k = op->insert_keys.top;
bkey_copy(k, &op->insert_key);
- b = bch_alloc_sectors(op->c, op->wp, k,
- op->check_enospc,
- op->nowait ? NULL : cl);
+ b = bch_alloc_sectors_start(op->c, op->wp,
+ op->check_enospc,
+ op->nowait ? NULL : cl);
BUG_ON(!b);
if (PTR_ERR(b) == -EAGAIN) {
@@ -458,30 +944,15 @@ static void __bch_write(struct closure *cl)
op->open_buckets[open_bucket_nr++] = b;
+ /*
+ * XXX: if we compressed, we didn't use all the space we just
+ * allocated
+ */
+ bch_write_extent(op, b, k, bio);
bch_cut_front(k->k.p, &op->insert_key);
- n = bio_next_split(bio, k->k.size, GFP_NOIO,
- &op->c->bio_write);
- if (n == bio)
- bio_get(bio);
-
- n->bi_end_io = bch_write_endio;
- n->bi_private = cl;
-#if 0
- if (KEY_CSUM(k))
- bio_csum(n, k);
-#endif
- trace_bcache_cache_insert(&k->k);
-
- bio_set_op_attrs(n, REQ_OP_WRITE, 0);
-#ifndef CONFIG_BCACHE_NO_IO
- bch_submit_bbio_replicas(to_wbio(n), op->c, k,
- ptrs_from, false);
-#else
- bch_bbio_prep(to_bbio(n), NULL);
- closure_get(n->bi_private);
- bio_endio(n);
-#endif
+ BUG_ON(op->insert_key.k.size &&
+ op->insert_key.k.size != bio_sectors(bio));
BUG_ON(bch_extent_normalize(op->c, bkey_i_to_s(k)));
bch_check_mark_super(op->c, k, false);
@@ -489,7 +960,9 @@ static void __bch_write(struct closure *cl)
bkey_extent_set_cached(&k->k, op->cached);
bch_keylist_enqueue(&op->insert_keys);
- } while (n != bio);
+
+ trace_bcache_cache_insert(&k->k);
+ } while (op->insert_key.k.size);
op->write_done = true;
continue_at(cl, bch_write_index, op->c->wq);
@@ -775,69 +1248,17 @@ struct cache_promote_op {
struct closure cl;
struct bio *orig_bio;
struct bch_write_op iop;
- bool stale; /* was the ptr stale after the read? */
struct bch_write_bio bio; /* must be last */
};
static void cache_promote_done(struct closure *cl)
{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct cache_set *c = op->iop.c;
-
- if (op->iop.replace_collision) {
- trace_bcache_promote_collision(&op->iop.replace_info.key.k);
- atomic_inc(&c->accounting.collector.cache_miss_collisions);
- }
-
- bch_bio_free_pages(&op->iop.bio->bio.bio);
- kfree(op);
-}
-
-static void cache_promote_write(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct bio *bio = &op->iop.bio->bio.bio;
-
- bio_reset(bio);
- bio->bi_iter.bi_sector = bkey_start_offset(&op->iop.insert_key.k);
- bio->bi_iter.bi_size = op->iop.insert_key.k.size << 9;
- /* needed to reinit bi_vcnt so pages can be freed later */
- bch_bio_map(bio, NULL);
-
- bio_copy_data(op->orig_bio, bio);
- op->orig_bio->bi_error = op->iop.error;
- bio_endio(op->orig_bio);
-
- if (!op->stale &&
- !op->iop.error &&
- !test_bit(CACHE_SET_RO, &op->iop.c->flags) &&
- !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
- closure_call(&op->iop.cl, bch_write, NULL, cl);
-
- closure_return_with_destructor(cl, cache_promote_done);
-}
-
-static void cache_promote_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
struct cache_promote_op *op =
- container_of(bio, struct cache_promote_op, bio.bio.bio);
+ container_of(cl, struct cache_promote_op, cl);
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
-
- if (bio->bi_error)
- op->iop.error = bio->bi_error;
- else if (b->ca && ptr_stale(b->ca, &b->ptr))
- op->stale = 1;
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
+ bch_bio_free_pages_pool(op->iop.c, op->orig_bio);
+ bio_put(op->orig_bio);
+ kfree(op);
}
/**
@@ -852,6 +1273,7 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
struct bkey_s_c new,
unsigned write_flags)
{
+#if 0
struct cache_promote_op *op;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
@@ -886,10 +1308,12 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
op->stale = 0;
bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
- new, old, BCH_WRITE_CHECK_ENOSPC|write_flags);
+ new, old,
+ BCH_WRITE_CHECK_ENOSPC|
+ BCH_WRITE_ALLOC_NOWAIT|write_flags);
- bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
- bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
+ //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
+ //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
trace_bcache_promote(&orig_bio->bio);
@@ -901,24 +1325,7 @@ out_free:
kfree(op);
out_submit:
generic_make_request(&orig_bio->bio);
-}
-
-/**
- * cache_promote - promote data stored in higher tiers
- *
- * Used for flash only volumes.
- *
- * @bio must actually be a bbio with valid key.
- */
-bool cache_promote(struct cache_set *c, struct bbio *bio, struct bkey_s_c k)
-{
- if (!CACHE_TIER(&bio->ca->mi)) {
- generic_make_request(&bio->bio);
- return 0;
- }
-
- __cache_promote(c, bio, k, k, BCH_WRITE_ALLOC_NOWAIT);
- return 1;
+#endif
}
/* Read */
@@ -927,36 +1334,325 @@ static void bch_read_requeue(struct cache_set *c, struct bio *bio)
{
unsigned long flags;
+ BUG();
+
spin_lock_irqsave(&c->read_race_lock, flags);
bio_list_add(&c->read_race_list, bio);
spin_unlock_irqrestore(&c->read_race_lock, flags);
queue_work(c->wq, &c->read_race_work);
}
-static void bch_read_endio(struct bio *bio)
+static int bio_uncompress_gzip(struct cache_set *c,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bio *src, struct bvec_iter src_iter,
+ unsigned skip)
{
- struct bbio *b = to_bbio(bio);
- struct cache *ca = b->ca;
- struct bio *orig = bio->bi_private;
-
- bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
-
- if (!bio->bi_error && ca &&
- (race_fault() ||
- ptr_stale(ca, &b->ptr))) {
- /* Read bucket invalidate race */
- atomic_long_inc(&ca->set->cache_read_races);
- bch_read_requeue(ca->set, bio);
+ z_stream strm;
+ struct page *workspace;
+ void *k_out = NULL;
+ u8 garbage[128];
+ int ret;
+ bool decompress_all = true;
+
+ workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO);
+ strm.workspace = page_address(workspace);
+
+ zlib_inflateInit(&strm);
+ strm.next_in = NULL;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+ strm.avail_in = 0;
+
+ do {
+ if (strm.avail_out) {
+ ;
+ } else if (skip) {
+ strm.avail_out = min_t(unsigned, sizeof(garbage), skip);
+ strm.next_out = garbage;
+
+ skip -= strm.avail_out;
+ } else if (dst_iter.bi_size) {
+ struct bio_vec bv = bio_iter_iovec(dst, dst_iter);
+
+ if (k_out)
+ kunmap_atomic(k_out);
+ k_out = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+ strm.avail_out = bv.bv_len;
+ strm.next_out = k_out;
+
+ bio_advance_iter(dst, &dst_iter, bv.bv_len);
+ } else {
+ /* Uncompressed all the data we actually want: */
+ if (!decompress_all) {
+ ret = Z_STREAM_END;
+ break;
+ }
+
+ strm.avail_out = sizeof(garbage);
+ strm.next_out = garbage;
+ }
+
+ if (!strm.avail_in && src_iter.bi_size) {
+ struct bio_vec bv = bio_iter_iovec(src, src_iter);
+
+ strm.avail_in = bv.bv_len;
+ strm.next_in = page_address(bv.bv_page) + bv.bv_offset;
+
+ bio_advance_iter(src, &src_iter, bv.bv_len);
+ }
+ } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK);
+
+ if (k_out)
+ kunmap_atomic(k_out);
+
+ mempool_free(workspace, &c->compression_workspace_pool);
+
+ return ret == Z_STREAM_END ? 0 : -EIO;
+}
+
+static int bio_checksum_uncompress(struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio.bio;
+ int ret = 0;
+
+ /* reset iterator for checksum */
+ bio->bi_iter.bi_size = rbio->compressed_size << 9;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+
+ if (rbio->csum_type != BCH_CSUM_NONE &&
+ rbio->csum != checksum_bio(bio, rbio->csum_type)) {
+ /*
+ * XXX: bch_bbio_count_io_errors() isn't counting checksum
+ * errors
+ */
+ __bcache_io_error(rbio->c, "checksum error");
+ return -EIO;
+ }
+
+ switch (rbio->compression_type) {
+ case BCH_COMPRESSION_NONE:
+ if (rbio->bounce) {
+ bio_advance(bio, rbio->offset << 9);
+ bio_copy_data_iter(rbio->parent, rbio->parent_iter,
+ bio, bio->bi_iter);
+ }
+ break;
+ case BCH_COMPRESSION_LZO1X:
+ BUG();
+ case BCH_COMPRESSION_GZIP:
+ ret = bio_uncompress_gzip(rbio->c,
+ rbio->parent,
+ rbio->parent_iter,
+ bio, bio->bi_iter,
+ rbio->offset << 9);
+ break;
+ case BCH_COMPRESSION_XZ:
+ BUG();
+ default:
+ BUG();
+ }
+
+ if (ret)
+ __bcache_io_error(rbio->c, "decompression error");
+
+ return ret;
+}
+
+/* Inner part that may run in process context */
+static void __bch_read_endio(struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio.bio;
+ int ret;
+
+ ret = bio_checksum_uncompress(rbio);
+ if (ret)
+ rbio->parent->bi_error = ret;
+ bio_endio(rbio->parent);
+
+ if (!ret && rbio->promote &&
+ !test_bit(CACHE_SET_RO, &rbio->c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &rbio->c->flags)) {
+ struct closure *cl = &rbio->promote->cl;
+
+ closure_init(cl, &rbio->c->cl);
+ closure_call(&rbio->promote->iop.cl, bch_write, rbio->c->wq, cl);
+ closure_return_with_destructor(cl, cache_promote_done);
} else {
- if (bio->bi_error)
- orig->bi_error = bio->bi_error;
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (rbio->bounce)
+ bch_bio_free_pages_pool(rbio->c, bio);
- bio_endio(orig);
bio_put(bio);
}
+}
- if (ca)
- percpu_ref_put(&ca->ref);
+void bch_bio_decompress_work(struct work_struct *work)
+{
+ struct bio_decompress_worker *d =
+ container_of(work, struct bio_decompress_worker, work);
+ struct llist_node *list, *next;
+ struct bch_read_bio *rbio;
+
+ while ((list = llist_del_all(&d->bio_list)))
+ for (list = llist_reverse_order(list);
+ list;
+ list = next) {
+ next = llist_next(list);
+ rbio = container_of(list, struct bch_read_bio, list);
+
+ __bch_read_endio(rbio);
+ }
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio.bio);
+ bool stale = //race_fault() ||
+ ptr_stale(rbio->bio.ca, &rbio->bio.ptr);
+ int error = bio->bi_error;
+
+ bch_bbio_count_io_errors(&rbio->bio, error, "reading from cache");
+ percpu_ref_put(&rbio->bio.ca->ref);
+
+ if (error)
+ goto out;
+
+ if (stale)
+ goto stale;
+
+ if (rbio->compression_type != BCH_COMPRESSION_NONE) {
+ struct bio_decompress_worker *d;
+
+ preempt_disable();
+ d = this_cpu_ptr(rbio->c->bio_decompress_worker);
+ llist_add(&rbio->list, &d->bio_list);
+ queue_work(system_unbound_wq, &d->work);
+ preempt_enable();
+ } else {
+ __bch_read_endio(rbio);
+ }
+
+ return;
+stale:
+ if (rbio->promote)
+ kfree(rbio->promote);
+ rbio->promote = NULL;
+
+ /* Raced with the bucket being reused and invalidated: */
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) {
+ atomic_long_inc(&rbio->c->cache_read_races);
+ bch_read_requeue(rbio->c, bio);
+ return;
+ }
+
+ error = -EINTR;
+out:
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (error)
+ rbio->parent->bi_error = error;
+ bio_endio(rbio->parent);
+ bio_put(bio);
+}
+
+void bch_read_extent(struct cache_set *c, struct bio *orig,
+ struct bkey_s_c k, struct extent_pick_ptr *pick,
+ unsigned skip, unsigned flags)
+{
+ struct bio *bio;
+ struct bch_read_bio *rbio;
+ struct cache_promote_op *promote_op = NULL;
+ bool bounce = false, read_full = false;
+
+ /* only promote if we're not reading from the fastest tier: */
+ if ((flags & BCH_READ_PROMOTE) && CACHE_TIER(&pick->ca->mi)) {
+ promote_op = kmalloc(sizeof(*promote_op), GFP_NOIO);
+
+ if (promote_op)
+ bounce = true;
+ }
+
+ /*
+ * note: if compression_type and crc_type both == none, then
+ * compressed/uncompressed size is zero
+ */
+ if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick->crc.csum_type != BCH_CSUM_NONE &&
+ (bio_sectors(orig) != pick->crc.uncompressed_size ||
+ (flags & BCH_READ_FORCE_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (bounce) {
+ unsigned sectors =
+ !read_full ? bio_sectors(orig)
+ : pick->crc.compressed_size ?: k.k->size;
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read);
+ bch_bio_alloc_pages_pool(c, bio, sectors << 9);
+ } else {
+ bio = bio_clone_fast(orig, GFP_NOIO, &c->bio_read);
+ }
+
+ rbio = container_of(bio, struct bch_read_bio, bio.bio);
+ memset(rbio, 0, offsetof(struct bch_read_bio, bio));
+
+ rbio->csum = pick->crc.csum;
+ rbio->compressed_size = pick->crc.compressed_size;
+ rbio->uncompressed_size = pick->crc.uncompressed_size;
+ rbio->offset = pick->crc.offset;
+ rbio->csum_type = pick->crc.csum_type;
+ rbio->compression_type = pick->crc.compression_type;
+
+ __bio_inc_remaining(orig);
+ rbio->parent = orig;
+ rbio->parent_iter = orig->bi_iter;
+ rbio->c = c;
+ rbio->flags = flags;
+ rbio->bounce = bounce;
+ rbio->promote = promote_op;
+ rbio->bio.ptr = pick->ptr;
+ bio->bi_end_io = bch_read_endio;
+ bch_bbio_prep(&rbio->bio, pick->ca);
+
+ if (read_full)
+ rbio->offset += skip;
+ else
+ bio->bi_iter.bi_sector += skip;
+
+ if (promote_op) {
+ promote_op->orig_bio = bio;
+
+ bch_write_op_init(&promote_op->iop, c,
+ &promote_op->bio,
+ &c->promote_write_point,
+ k, k,
+ BCH_WRITE_CHECK_ENOSPC|
+ BCH_WRITE_ALLOC_NOWAIT);
+
+ if (!read_full) {
+ bch_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) + skip),
+ &promote_op->iop.insert_key);
+ bch_key_resize(&promote_op->iop.insert_key.k,
+ bio_sectors(orig));
+ }
+
+ __bio_clone_fast(&promote_op->bio.bio.bio, bio);
+ }
+
+#ifndef CONFIG_BCACHE_NO_IO
+ generic_make_request(bio);
+#else
+ bio_endio(bio);
+#endif
}
/* XXX: this looks a lot like cache_lookup_fn() */
@@ -970,9 +1666,7 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bio->bi_iter.bi_sector), k) {
struct extent_pick_ptr pick;
- struct bio *n;
- struct bbio *bbio;
- unsigned sectors;
+ unsigned bytes, sectors;
bool done;
BUG_ON(bkey_cmp(bkey_start_pos(k.k),
@@ -981,8 +1675,12 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
BUG_ON(bkey_cmp(k.k->p,
POS(inode, bio->bi_iter.bi_sector)) <= 0);
- sectors = k.k->p.offset - bio->bi_iter.bi_sector;
- done = sectors >= bio_sectors(bio);
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ done = bytes == bio->bi_iter.bi_size;
+
+ swap(bio->bi_iter.bi_size, bytes);
pick = bch_extent_pick_ptr(c, k);
if (IS_ERR(pick.ca)) {
@@ -994,43 +1692,19 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
c->prio_clock[READ].hand;
- n = sectors >= bio_sectors(bio)
- ? bio_clone_fast(bio, GFP_NOIO, &c->bio_split)
- : bio_split(bio, sectors, GFP_NOIO,
- &c->bio_split);
-
- n->bi_private = bio;
- n->bi_end_io = bch_read_endio;
- __bio_inc_remaining(bio);
-
- bbio = to_bbio(n);
- bbio->key.k = *k.k;
- bbio->ptr = pick.ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(POS(inode, n->bi_iter.bi_sector),
- &bbio->key);
- bch_cut_back(POS(inode, bio_end_sector(n)),
- &bbio->key.k);
- bch_bbio_prep(bbio, pick.ca);
-
-#ifndef CONFIG_BCACHE_NO_IO
- cache_promote(c, bbio, k);
-#else
- bio_endio(n);
-#endif
+ bch_read_extent(c, bio, k, &pick,
+ bio->bi_iter.bi_sector -
+ bkey_start_offset(k.k),
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE);
} else {
- unsigned bytes = min_t(unsigned, sectors,
- bio_sectors(bio)) << 9;
-
- swap(bio->bi_iter.bi_size, bytes);
zero_fill_bio(bio);
- swap(bio->bi_iter.bi_size, bytes);
-
- bio_advance(bio, bytes);
}
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
if (done) {
bch_btree_iter_unlock(&iter);
return 0;
@@ -1069,7 +1743,8 @@ static void bch_read_retry(struct bbio *bbio)
* The inode, offset and size come from the bbio's key,
* which was set by bch_read_fn().
*/
- inode = bbio->key.k.p.inode;
+ BUG(); /* currently broken */
+ //inode = bbio->key.k.p.inode;
parent = bio->bi_private;
bch_bbio_reset(bbio);
diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h
index fb024d2e5fa8..53c8b3aa07ea 100644
--- a/drivers/md/bcache/io.h
+++ b/drivers/md/bcache/io.h
@@ -1,17 +1,16 @@
#ifndef _BCACHE_IO_H
#define _BCACHE_IO_H
-struct bbio {
- struct cache *ca;
+#include <linux/zlib.h>
- unsigned int bi_idx; /* current index into bvl_vec */
+#define COMPRESSION_WORKSPACE_SIZE \
+ max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), \
+ zlib_inflate_workspacesize())
- unsigned int bi_bvec_done; /* number of bytes completed in
- current bvec */
- unsigned submit_time_us;
- struct bkey_i key;
+struct bbio {
+ struct cache *ca;
struct bch_extent_ptr ptr;
- /* Only ever have a single pointer (the one we're doing io to/from) */
+ unsigned submit_time_us;
struct bio bio;
};
@@ -95,6 +94,41 @@ void bch_write_op_init(struct bch_write_op *, struct cache_set *,
struct bkey_s_c, struct bkey_s_c, unsigned);
void bch_write(struct closure *);
+struct cache_promote_op;
+
+struct bch_read_bio {
+ struct bio *parent;
+ struct bvec_iter parent_iter;
+
+ struct cache_set *c;
+ unsigned flags;
+
+ /* fields align with bch_extent_crc64 */
+ u64 bounce:3,
+ compressed_size:18,
+ uncompressed_size:18,
+ offset:17,
+ csum_type:4,
+ compression_type:4;
+ u64 csum;
+
+ struct cache_promote_op *promote;
+
+ struct llist_node list;
+ struct bbio bio;
+};
+
+struct extent_pick_ptr;
+
+void bch_read_extent(struct cache_set *, struct bio *, struct bkey_s_c,
+ struct extent_pick_ptr *, unsigned, unsigned);
+
+enum bch_read_flags {
+ BCH_READ_FORCE_BOUNCE = 1 << 0,
+ BCH_READ_RETRY_IF_STALE = 1 << 1,
+ BCH_READ_PROMOTE = 1 << 2,
+};
+
int bch_read(struct cache_set *, struct bio *, u64);
void bch_cache_io_error_work(struct work_struct *);
@@ -104,8 +138,7 @@ void bch_bbio_endio(struct bbio *, int, const char *);
void bch_generic_make_request(struct bio *, struct cache_set *);
void bch_bio_submit_work(struct work_struct *);
-void bch_bbio_prep(struct bbio *, struct cache *);
-void bch_submit_bbio(struct bbio *, struct cache *, const struct bkey_i *,
+void bch_submit_bbio(struct bbio *, struct cache *,
const struct bch_extent_ptr *, bool);
void bch_submit_bbio_replicas(struct bch_write_bio *, struct cache_set *,
const struct bkey_i *, unsigned, bool);
@@ -119,6 +152,8 @@ bool cache_promote(struct cache_set *, struct bbio *, struct bkey_s_c);
void bch_read_race_work(struct work_struct *);
void bch_wake_delayed_writes(unsigned long data);
+void bch_bio_decompress_work(struct work_struct *);
+
extern struct workqueue_struct *bcache_io_wq;
#endif /* _BCACHE_IO_H */
diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h
new file mode 100644
index 000000000000..2a8e7c6a7386
--- /dev/null
+++ b/drivers/md/bcache/io_types.h
@@ -0,0 +1,12 @@
+#ifndef _BCACHE_IO_TYPES_H
+#define _BCACHE_IO_TYPES_H
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bio_decompress_worker {
+ struct work_struct work;
+ struct llist_head bio_list;
+};
+
+#endif /* _BCACHE_IO_TYPES_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 0a7550a0294d..08879ba95be7 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -1170,7 +1170,7 @@ static void journal_next_bucket(struct cache_set *c)
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
struct cache *ca;
- unsigned iter;
+ unsigned iter, replicas;
lockdep_assert_held(&j->lock);
@@ -1198,7 +1198,11 @@ static void journal_next_bucket(struct cache_set *c)
if (!(ca = PTR_CACHE(c, ptr)) ||
CACHE_STATE(&ca->mi) != CACHE_ACTIVE ||
ca->journal.sectors_free <= j->sectors_free)
- bch_extent_drop_ptr(e, ptr);
+ __bch_extent_drop_ptr(e, ptr);
+
+ replicas = 0;
+ extent_for_each_ptr(e, ptr)
+ replicas++;
/*
* Determine location of the next journal write:
@@ -1209,7 +1213,7 @@ static void journal_next_bucket(struct cache_set *c)
unsigned next, remaining, nr_buckets =
bch_nr_journal_buckets(&ca->sb);
- if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb))
+ if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb))
break;
/*
@@ -1247,28 +1251,25 @@ static void journal_next_bucket(struct cache_set *c)
if (!remaining)
continue;
- BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX);
-
ja->sectors_free = ca->mi.bucket_size;
-
ja->cur_idx = next;
- e.v->ptr[bch_extent_ptrs(e)] = (struct bch_extent_ptr) {
- .gen = 0,
- .dev = ca->sb.nr_this_dev,
- .offset = bucket_to_sector(ca,
- journal_bucket(ca, ja->cur_idx)),
- };
-
ja->bucket_seq[ja->cur_idx] = j->seq;
+ extent_ptr_append(bkey_i_to_extent(&j->key),
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ journal_bucket(ca, ja->cur_idx)),
+ .dev = ca->sb.nr_this_dev,
+ });
+ replicas++;
+
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
- bch_set_extent_ptrs(e, bch_extent_ptrs(e) + 1);
}
/* set j->sectors_free to the min of any device */
j->sectors_free = UINT_MAX;
- if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb))
+ if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb))
extent_for_each_online_device(c, e, ptr, ca)
j->sectors_free = min(j->sectors_free,
ca->journal.sectors_free);
diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c
index 66bf35c082c5..4ee369a4b7a9 100644
--- a/drivers/md/bcache/migrate.c
+++ b/drivers/md/bcache/migrate.c
@@ -136,14 +136,6 @@ static enum migrate_option migrate_cleanup_key(struct cache_set *c,
return MIGRATE_IGNORE;
}
- /*
- * Remove all pointers, to avoid too many in a tier.
- * migrate_compact_key above does the same when nr_replicas is 1, and
- * doesn't actually work if nr_replicas > 1, so do something simple
- * instead. Effectively, every migration copy is a fresh 'foreground'
- * write.
- */
- bch_set_extent_ptrs(e, 0);
return MIGRATE_COPY;
}
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 2aed02880a36..87dcac33cb4b 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -427,9 +427,8 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q)
static void read_moving_endio(struct bio *bio)
{
- struct bbio *b = container_of(bio, struct bbio, bio);
- struct moving_io *io = container_of(bio->bi_private,
- struct moving_io, cl);
+ struct closure *cl = bio->bi_private;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_queue *q = io->q;
struct moving_context *ctxt = io->context;
bool stopped;
@@ -439,11 +438,9 @@ static void read_moving_endio(struct bio *bio)
if (bio->bi_error) {
io->op.error = bio->bi_error;
moving_error(io->context, MOVING_FLAG_READ);
- } else if (ptr_stale(b->ca, &bkey_i_to_extent_c(&b->key)->v.ptr[0])) {
- io->op.error = -EINTR;
}
- bch_bbio_endio(b, bio->bi_error, "reading data to move");
+ bio_put(bio);
spin_lock_irqsave(&q->lock, flags);
@@ -488,7 +485,10 @@ static void __bch_data_move(struct closure *cl)
bio_set_op_attrs(&io->bio.bio.bio, REQ_OP_READ, 0);
io->bio.bio.bio.bi_end_io = read_moving_endio;
- bch_submit_bbio(&io->bio.bio, pick.ca, &io->key, &pick.ptr, false);
+ bch_read_extent(io->op.c, &io->bio.bio.bio,
+ bkey_i_to_s_c(&io->key),
+ &pick, 0, 0);
+ bio_endio(&io->bio.bio.bio);
}
/*
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 17a910ef114e..167c2f185f0e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -364,9 +364,9 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
miss->bi_end_io = request_endio;
miss->bi_private = &s->cl;
- to_bbio(miss)->key.k = KEY(s->inode,
- bio_end_sector(miss),
- bio_sectors(miss));
+ //to_bbio(miss)->key.k = KEY(s->inode,
+ // bio_end_sector(miss),
+ // bio_sectors(miss));
to_bbio(miss)->ca = NULL;
closure_get(&s->cl);
@@ -375,7 +375,7 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
bkey_to_s_c(&KEY(replace.key.k.p.inode,
replace.key.k.p.offset,
replace.key.k.size)),
- BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED);
+ BCH_WRITE_CACHED);
return 0;
nopromote:
@@ -388,23 +388,6 @@ nopromote:
return 0;
}
-static void bch_cache_read_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct closure *cl = bio->bi_private;
- struct search *s = container_of(cl, struct search, cl);
-
- if (bio->bi_error)
- s->iop.error = bio->bi_error;
- else if (ptr_stale(b->ca, &b->ptr)) {
- /* Read bucket invalidate race */
- atomic_long_inc(&s->iop.c->cache_read_races);
- s->iop.error = -EINTR;
- }
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
-}
-
static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
struct closure *cl = &s->cl;
@@ -417,9 +400,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
for_each_btree_key_with_holes(&iter, s->iop.c, BTREE_ID_EXTENTS,
POS(s->inode, bio->bi_iter.bi_sector), k) {
struct extent_pick_ptr pick;
- struct bio *n;
- struct bbio *bbio;
- unsigned sectors;
+ unsigned sectors, bytes;
bool done;
retry:
BUG_ON(bkey_cmp(bkey_start_pos(k.k),
@@ -428,8 +409,12 @@ retry:
BUG_ON(bkey_cmp(k.k->p,
POS(s->inode, bio->bi_iter.bi_sector)) <= 0);
- sectors = k.k->p.offset - bio->bi_iter.bi_sector;
- done = sectors >= bio_sectors(bio);
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ done = bytes == bio->bi_iter.bi_size;
+
+ swap(bio->bi_iter.bi_size, bytes);
pick = bch_extent_pick_ptr(s->iop.c, k);
if (IS_ERR(pick.ca)) {
@@ -452,33 +437,17 @@ retry:
if (!bkey_extent_is_cached(k.k))
s->read_dirty_data = true;
- n = bio_next_split(bio, sectors, GFP_NOIO,
- &s->d->bio_split);
-
- bbio = to_bbio(n);
- bbio->key.k = *k.k;
- bbio->ptr = pick.ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(POS(s->inode, n->bi_iter.bi_sector),
- &bbio->key);
- bch_cut_back(POS(s->inode, bio_end_sector(n)),
- &bbio->key.k);
-
- bch_bbio_prep(bbio, pick.ca);
-
- n->bi_end_io = bch_cache_read_endio;
- n->bi_private = &s->cl;
-
- closure_get(&s->cl);
- if (!s->bypass) {
- if (cache_promote(s->iop.c, bbio, k))
- s->cache_miss = 1;
- } else
- submit_bio(n);
+ bch_read_extent(s->iop.c, bio, k, &pick,
+ bio->bi_iter.bi_sector -
+ bkey_start_offset(k.k),
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ (!s->bypass ? BCH_READ_PROMOTE : 0));
}
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
if (done) {
bch_btree_iter_unlock(&iter);
goto out;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 61047a66fb0c..05c62ca25f74 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -142,6 +142,8 @@ static const char *bch_blkdev_open(const char *path, void *holder,
if (IS_ERR(bdev))
return "failed to open device";
+ bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
*ret = bdev;
return NULL;
}
@@ -764,9 +766,12 @@ static void cache_set_free(struct closure *cl)
bch_io_clock_exit(&c->io_clock[WRITE]);
bch_io_clock_exit(&c->io_clock[READ]);
bdi_destroy(&c->bdi);
- bioset_exit(&c->btree_read_bio);
+ free_percpu(c->bio_decompress_worker);
+ mempool_exit(&c->compression_workspace_pool);
+ mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_split);
+ bioset_exit(&c->bio_read);
+ bioset_exit(&c->btree_read_bio);
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
mempool_exit(&c->search);
@@ -893,6 +898,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
struct cache_set *c;
unsigned iter_size;
+ int cpu;
c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
if (!c)
@@ -952,9 +958,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freeable);
INIT_LIST_HEAD(&c->btree_cache_freed);
+ mutex_init(&c->bio_bounce_pages_lock);
INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
spin_lock_init(&c->bio_submit_lock);
-
bio_list_init(&c->read_race_list);
spin_lock_init(&c->read_race_lock);
INIT_WORK(&c->read_race_work, bch_read_race_work);
@@ -992,9 +998,14 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
BTREE_RESERVE_SIZE) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) ||
- bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) ||
bioset_init(&c->btree_read_bio, 1, offsetof(struct bbio, bio)) ||
+ bioset_init(&c->bio_read, 4, offsetof(struct bch_read_bio, bio.bio)) ||
+ bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ CRC32_EXTENT_SIZE_MAX / PAGE_SECTORS, 0) ||
+ mempool_init_page_pool(&c->compression_workspace_pool, 1,
+ get_order(COMPRESSION_WORKSPACE_SIZE)) ||
+ !(c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker)) ||
bdi_setup_and_register(&c->bdi, "bcache") ||
bch_io_clock_init(&c->io_clock[READ]) ||
bch_io_clock_init(&c->io_clock[WRITE]) ||
@@ -1003,9 +1014,18 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c))))
goto err;
+ for_each_possible_cpu(cpu) {
+ struct bio_decompress_worker *d =
+ per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+ INIT_WORK(&d->work, bch_bio_decompress_work);
+ init_llist_head(&d->bio_list);
+ }
+
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
c->bdi.congested_fn = bch_congested_fn;
c->bdi.congested_data = c;
+ c->bdi.capabilities |= BDI_CAP_STABLE_WRITES;
return c;
err:
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index ed05c6f4d412..62108446b82d 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -24,30 +24,23 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
struct cache_member_rcu *mi;
- unsigned replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
- unsigned dev;
- bool ret = false;
+ unsigned replicas = 0;
- /*
- * Should not happen except in a pathological situation (too
- * many pointers on the wrong tier?
- */
- if (bch_extent_ptrs(e) == BKEY_EXTENT_PTRS_MAX)
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_MAX_U64s >
+ BKEY_EXTENT_VAL_U64s_MAX)
return false;
- /*
- * Need at least CACHE_SET_DATA_REPLICAS_WANT ptrs not on tier 0
- */
- if (bch_extent_ptrs(e) < replicas)
- return true;
-
- dev = e.v->ptr[bch_extent_ptrs(e) - replicas].dev;
mi = cache_member_info_get(c);
- ret = dev < mi->nr_in_set && !CACHE_TIER(&mi->m[dev]);
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev < mi->nr_in_set &&
+ CACHE_TIER(&mi->m[ptr->dev]))
+ replicas++;
cache_member_info_put();
- return ret;
+ return replicas < CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
}
return false;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index f03453ee69f1..2ca58a386cdf 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -621,13 +621,11 @@ TRACE_EVENT(bcache_btree_insert_key,
__field(u64, b_bucket )
__field(u64, b_offset )
__field(u64, offset )
- __field(u64, bucket )
__field(u32, b_inode )
__field(u32, inode )
__field(u32, size )
__field(u8, level )
__field(u8, id )
- __field(u8, cached )
__field(u8, op )
__field(u8, insert_done )
),
@@ -638,22 +636,18 @@ TRACE_EVENT(bcache_btree_insert_key,
__entry->id = b->btree_id;
__entry->b_inode = b->key.k.p.inode;
__entry->b_offset = b->key.k.p.offset;
- __entry->bucket = PTR_BUCKET_NR_TRACE(b->c, k, 0);
__entry->inode = k->k.p.inode;
__entry->offset = k->k.p.offset;
__entry->size = k->k.size;
- __entry->cached = bkey_extent_is_cached(&k->k);
__entry->op = op;
__entry->insert_done = insert_done;
),
- TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u%s -> %llu",
+ TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u",
__entry->insert_done, __entry->op,
__entry->b_bucket, __entry->level, __entry->id,
__entry->b_inode, __entry->b_offset,
- __entry->inode, __entry->offset,
- __entry->size, __entry->cached ? " cached" : "",
- __entry->bucket)
+ __entry->inode, __entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(btree_split,
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 8ea6758301a7..a5ab2935c146 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -244,19 +244,139 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
/* Extents */
/*
- * bcache keys index the end of the extent as the offset
- * The end is exclusive, while the start is inclusive
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the start of the data that
+ * is currently live. The size field in struct bkey records the current (live)
+ * size of the extent, and is also used to mean "size of region on disk that we
+ * point to" in this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
*/
+enum bch_extent_entry_type {
+ BCH_EXTENT_ENTRY_crc32 = 0,
+ BCH_EXTENT_ENTRY_ptr = 1,
+ BCH_EXTENT_ENTRY_crc64 = 2,
+};
+
+#define BCH_EXTENT_ENTRY_MAX 3
+
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:1,
+ offset:7,
+ compressed_size:8,
+ uncompressed_size:8,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum_type:4,
+ compression_type:4,
+ uncompressed_size:8,
+ compressed_size:8,
+ offset:7,
+ type:1;
+#endif
+ __u32 csum;
+} __attribute__((packed)) __attribute__((aligned(8)));
+
+#define CRC32_EXTENT_SIZE_MAX (1U << 7)
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ compressed_size:18,
+ uncompressed_size:18,
+ offset:17,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_type:4,
+ compression_type:4,
+ offset:17,
+ uncompressed_size:18,
+ compressed_size:18,
+ type:3;
+#endif
+ __u64 csum;
+} __attribute__((packed)) __attribute__((aligned(8)));
+
+#define CRC64_EXTENT_SIZE_MAX (1U << 17)
+
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 gen:8,
+ __u64 type:2,
+ erasure_coded:1,
+ offset:45, /* 16 petabytes */
dev:8,
- offset:48;
+ gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 offset:48,
+ __u64 gen:8,
dev:8,
- gen:8;
+ offset:45,
+ erasure_coded:1,
+ type:2;
#endif
} __attribute__((packed)) __attribute__((aligned(8)));
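
To make the trimming rules described in the comment above concrete, here is an
illustrative sketch (not code from this patch) of what front-trimming @sectors
does to the two kinds of pointers; shrinking the bkey's own size field is
assumed to happen separately (e.g. via bch_cut_front()):

/* Unchecksummed, uncompressed: the pointer itself can simply move forward. */
static void trim_front_plain_sketch(struct bch_extent_ptr *ptr, unsigned sectors)
{
	ptr->offset += sectors;		/* now points at the first live sector */
}

/*
 * Checksummed and/or compressed: the originally written region must stay
 * addressable as a whole, so only the "offset into the original extent"
 * moves; compressed_size, uncompressed_size and csum are unchanged.
 */
static void trim_front_crc_sketch(struct bch_extent_crc64 *crc, unsigned sectors)
{
	crc->offset += sectors;
}

For example, trimming 16 sectors off the front of a 128-sector checksummed
extent leaves compressed_size/uncompressed_size at 128 and csum untouched, sets
crc.offset to 16, and the bkey's size becomes 112; a reader must still read
(and checksum) the full 128 sectors and return only the live 112.
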
@@ -264,6 +384,13 @@ struct bch_extent_ptr {
#define PTR_LOST_DEV 255 /* XXX: kill */
+union bch_extent_entry {
+ __u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+ struct bch_extent_ptr ptr;
+};
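
A minimal sketch, likewise not part of this patch, of how the first-set-bit
type encoding described above can be decoded and used to size entries when
walking an extent's value; the helper names are invented and the entry sizes
are assumed to be exactly sizeof() of the structs above:

#include <linux/bitops.h>

/*
 * crc32 -> 0b001, ptr -> 0b010, crc64 -> 0b100: the type field guarantees
 * exactly one of the low three bits is the first set bit, and its position
 * is the bch_extent_entry_type value.
 */
static inline enum bch_extent_entry_type
extent_entry_type_sketch(const union bch_extent_entry *e)
{
	return __ffs(e->type);
}

static inline unsigned extent_entry_bytes_sketch(const union bch_extent_entry *e)
{
	switch (extent_entry_type_sketch(e)) {
	case BCH_EXTENT_ENTRY_crc32:
		return sizeof(struct bch_extent_crc32);
	case BCH_EXTENT_ENTRY_crc64:
		return sizeof(struct bch_extent_crc64);
	case BCH_EXTENT_ENTRY_ptr:
		return sizeof(struct bch_extent_ptr);
	default:
		return 0;
	}
}

Walking the value is then just repeatedly advancing by
extent_entry_bytes_sketch() from bch_extent.start until the end of the bkey's
value, with each crc entry applying to the pointers that follow it.
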
+
enum {
BCH_EXTENT = 128,
@@ -277,9 +404,10 @@ enum {
struct bch_extent {
struct bch_val v;
- struct bch_extent_ptr ptr[0];
+
+ union bch_extent_entry start[0];
__u64 _data[0];
-};
+} __attribute__((packed)) __attribute__((aligned(8)));
BKEY_VAL_TYPE(extent, BCH_EXTENT);
/* Inodes */
@@ -552,6 +680,18 @@ enum {
BCH_DIRENT_CSUM_SHA1 = 3,
};
+BITMASK(CACHE_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52);
+
+BITMASK(CACHE_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56);
+enum {
+ BCH_COMPRESSION_NONE = 0,
+ BCH_COMPRESSION_LZO1X = 1,
+ BCH_COMPRESSION_GZIP = 2,
+ BCH_COMPRESSION_XZ = 3,
+};
+
+/* backing device specific stuff: */
+
BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U