author     Kent Overstreet <kent.overstreet@gmail.com>   2015-05-28 23:57:20 -0700
committer  Kent Overstreet <kent.overstreet@gmail.com>   2016-10-07 12:34:20 -0800
commit     5749e6138348d7c1546e28b4ac0ae9032c94e0c0 (patch)
tree       ac6f6ce3dced78b075c0b54ba5e0ad9a728a9951
parent     9d5c579320bcf93315155a30a1b6b975f2811468 (diff)
bcache: data checksumming
-rw-r--r--   drivers/md/bcache/Kconfig          2
-rw-r--r--   drivers/md/bcache/alloc.c          95
-rw-r--r--   drivers/md/bcache/alloc.h          6
-rw-r--r--   drivers/md/bcache/bcache.h         9
-rw-r--r--   drivers/md/bcache/bset.h           26
-rw-r--r--   drivers/md/bcache/btree.c          11
-rw-r--r--   drivers/md/bcache/btree.h          2
-rw-r--r--   drivers/md/bcache/buckets.c        15
-rw-r--r--   drivers/md/bcache/buckets.h        15
-rw-r--r--   drivers/md/bcache/debug.c          2
-rw-r--r--   drivers/md/bcache/extents.c        680
-rw-r--r--   drivers/md/bcache/extents.h        290
-rw-r--r--   drivers/md/bcache/io.c             1067
-rw-r--r--   drivers/md/bcache/io.h             55
-rw-r--r--   drivers/md/bcache/io_types.h       12
-rw-r--r--   drivers/md/bcache/journal.c        31
-rw-r--r--   drivers/md/bcache/migrate.c        8
-rw-r--r--   drivers/md/bcache/move.c           14
-rw-r--r--   drivers/md/bcache/request.c        71
-rw-r--r--   drivers/md/bcache/super.c          30
-rw-r--r--   drivers/md/bcache/tier.c           27
-rw-r--r--   include/trace/events/bcache.h      10
-rw-r--r--   include/uapi/linux/bcache.h        156
23 files changed, 1983 insertions, 651 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 55e135f6dd61..0f9410c06c45 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -3,6 +3,8 @@ config BCACHE
tristate "Block device as cache"
select LIBCRC32C
select FS_POSIX_ACL
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
---help---
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d567f4ae6df3..7b51888c5968 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1271,59 +1271,43 @@ static void verify_not_stale(struct cache_set *c, const struct open_bucket *ob)
}
/*
- * Allocates some space in the cache to write to, and k to point to the newly
- * allocated space, and updates k->size and k->offset (to point to the
- * end of the newly allocated space).
- *
- * May allocate fewer sectors than @sectors, k->size indicates how many
- * sectors were actually allocated.
- *
- * Return codes:
- * - -EAGAIN: closure was added to waitlist
- * - -ENOSPC: out of space and no closure provided
- *
- * @c - cache set.
- * @wp - write point to use for allocating sectors.
- * @k - key to return the allocated space information.
- * @cl - closure to wait for a bucket
+ * Get us an open_bucket we can allocate from, return with it locked:
*/
-struct open_bucket *bch_alloc_sectors(struct cache_set *c,
- struct write_point *wp,
- struct bkey_i *k,
- bool check_enospc,
- struct closure *cl)
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
+ struct write_point *wp,
+ bool check_enospc,
+ struct closure *cl)
{
- struct bkey_s_extent dst;
- struct bch_extent_ptr *ptr;
struct open_bucket *ob;
- struct cache *ca;
- unsigned sectors;
ob = lock_and_refill_writepoint(c, wp, check_enospc, cl);
if (IS_ERR_OR_NULL(ob))
return ob;
BUG_ON(!ob->sectors_free);
-
verify_not_stale(c, ob);
+ return ob;
+}
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+void bch_alloc_sectors_done(struct cache_set *c, struct write_point *wp,
+ struct bkey_i *k, struct open_bucket *ob,
+ unsigned sectors)
+{
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ unsigned i;
+
/*
* We're keeping any existing pointer k has, and appending new pointers:
* __bch_write() will only write to the pointers we add here:
*/
- dst = bkey_i_to_s_extent(k);
-
- /* Set up the pointer to the space we're allocating: */
- memcpy(&dst.v->ptr[bch_extent_ptrs(dst)],
- ob->ptrs, ob->nr_ptrs * sizeof(u64));
-
- bch_set_extent_ptrs(dst, bch_extent_ptrs(dst) + ob->nr_ptrs);
-
- sectors = min_t(unsigned, dst.k->size, ob->sectors_free);
-
- bch_key_resize(dst.k, sectors);
-
- /* update open bucket for next time: */
+ for (i = 0; i < ob->nr_ptrs; i++)
+ extent_ptr_append(bkey_i_to_extent(k), ob->ptrs[i]);
ob->sectors_free -= sectors;
if (ob->sectors_free)
@@ -1341,6 +1325,41 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
rcu_read_unlock();
mutex_unlock(&ob->lock);
+}
+
+/*
+ * Allocates some space in the cache to write to, sets up @k to point to the
+ * newly allocated space, and updates k->size and k->offset (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors; k->size indicates how many
+ * sectors were actually allocated.
+ *
+ * Return codes:
+ * - -EAGAIN: closure was added to waitlist
+ * - -ENOSPC: out of space and no closure provided
+ *
+ * @c - cache set.
+ * @wp - write point to use for allocating sectors.
+ * @k - key to return the allocated space information.
+ * @cl - closure to wait for a bucket
+ */
+struct open_bucket *bch_alloc_sectors(struct cache_set *c,
+ struct write_point *wp,
+ struct bkey_i *k,
+ bool check_enospc,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ ob = bch_alloc_sectors_start(c, wp, check_enospc, cl);
+ if (IS_ERR_OR_NULL(ob))
+ return ob;
+
+ if (k->k.size > ob->sectors_free)
+ bch_key_resize(&k->k, ob->sectors_free);
+
+ bch_alloc_sectors_done(c, wp, k, ob, k->k.size);
return ob;
}
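A hedged caller-side sketch (not part of the patch; example_alloc() and want_sectors are invented names) of how the two halves compose: splitting start/done lets the caller look at ob->sectors_free before deciding how many sectors to commit, presumably what the compressing write path needs. bch_alloc_sectors() above is this same sequence with the sector count clamped to the key size.

/* Illustrative only -- mirrors bch_alloc_sectors() above: */
static struct open_bucket *example_alloc(struct cache_set *c,
					 struct write_point *wp,
					 struct bkey_i *k,
					 unsigned want_sectors,
					 struct closure *cl)
{
	struct open_bucket *ob;
	unsigned sectors;

	ob = bch_alloc_sectors_start(c, wp, true, cl);
	if (IS_ERR_OR_NULL(ob))
		return ob;		/* waiting on a bucket, or out of space */

	/* decide how much to use now that ob->sectors_free is known: */
	sectors = min_t(unsigned, want_sectors, ob->sectors_free);
	bch_key_resize(&k->k, sectors);

	/* append ob's pointers to @k and consume the space: */
	bch_alloc_sectors_done(c, wp, k, ob, sectors);
	return ob;
}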
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 0ab405a19da9..c0118db8440e 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -16,6 +16,12 @@ void bch_prio_timer_start(struct cache_set *, int);
void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
+ struct write_point *,
+ bool, struct closure *);
+void bch_alloc_sectors_done(struct cache_set *, struct write_point *,
+ struct bkey_i *, struct open_bucket *, unsigned);
+
struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
struct bkey_i *, bool, struct closure *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 807278e80500..a160f5946c6e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -213,6 +213,7 @@
#include "blockdev_types.h"
#include "buckets_types.h"
#include "clock_types.h"
+#include "io_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "keybuf_types.h"
@@ -418,8 +419,6 @@ struct cache_set {
struct closure sb_write;
struct semaphore sb_write_mutex;
- struct bio_set bio_split;
-
struct backing_dev_info bdi;
/* BTREE CACHE */
@@ -563,7 +562,13 @@ struct cache_set {
struct rw_semaphore gc_lock;
/* IO PATH */
+ struct bio_set bio_read;
struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+ mempool_t compression_workspace_pool;
+ struct bio_decompress_worker __percpu
+ *bio_decompress_worker;
/* For punting bio submissions to workqueue, io.c */
struct bio_list bio_submit_list;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1cb60b65322a..4d8fb84d8c55 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -324,10 +324,30 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
/* Bkey utility code */
-#define BKEY_EXTENT_PTRS_MAX 4
-#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_PTRS_MAX)
+/* Amount of space we might need, in order to add a single pointer */
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_PTRS_MAX)
+/* XXX: move constants to uapi/linux/bcache.h */
+
+#define BKEY_EXTENT_PTR_MAX_U64s \
+ ((sizeof(struct bch_extent_crc64) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(u64))
+
+#define BKEY_EXTENT_PTRS_MAX 4
+
+#if 0
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ ((sizeof(struct bch_extent_crc64) +
+ sizeof(struct bch_extent_ptr)) * BKEY_EXTENT_PTRS_MAX)
+#else
+#define BKEY_EXTENT_VAL_U64s_MAX 8
+#endif
+
+#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+#define BKEY_BTREE_PTR_VAL_U64s_MAX BKEY_EXTENT_PTRS_MAX
+#define BKEY_BTREE_PTR_U64s_MAX (BKEY_U64s + BKEY_EXTENT_PTRS_MAX)
+
+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
#define __bkey_idx(_set, _offset) \
((_set)->_data + (_offset))
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index eac7354c572f..e422c4b2b0e2 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -335,6 +335,12 @@ static void bch_btree_init_next(struct cache_set *c, struct btree *b,
/* Btree IO */
+/*
+ * We seed the checksum with the entire first pointer (dev, gen and offset),
+ * since for btree nodes we have to store the checksum with the data instead of
+ * the pointer - this helps guard against reading a valid btree node that is not
+ * the node we actually wanted:
+ */
#define btree_csum_set(_b, _i) \
({ \
void *_data = (void *) (_i) + 8; \
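The effect of the seed is easy to model outside the kernel. A standalone sketch (FNV-1a standing in for the checksum the macro actually computes; all names invented): the same node bytes verify when reached through the pointer they were written at, and fail when reached through any other pointer.

#include <stdint.h>
#include <stdio.h>

static uint64_t fnv1a(uint64_t h, const void *p, size_t len)
{
	const uint8_t *d = p;

	while (len--) {
		h ^= *d++;
		h *= 0x100000001b3ULL;
	}
	return h;
}

struct node { uint64_t seq; char payload[56]; };

/* checksum of the node contents, seeded with the pointer used to read it */
static uint64_t node_csum(uint64_t ptr_offset, const struct node *n)
{
	uint64_t h = fnv1a(0xcbf29ce484222325ULL, &ptr_offset, sizeof(ptr_offset));

	return fnv1a(h, n, sizeof(*n));
}

int main(void)
{
	struct node n = { .seq = 1, .payload = "same bytes, two locations" };
	uint64_t stored = node_csum(4096, &n);	/* node written at offset 4096 */

	printf("read via offset 4096: %s\n",
	       node_csum(4096, &n) == stored ? "csum ok" : "csum bad");
	printf("read via offset 8192: %s\n",
	       node_csum(8192, &n) == stored ? "csum ok" : "csum bad");
	return 0;
}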
@@ -573,7 +579,7 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b)
bch_bio_map(bio, b->data);
bio_get(bio);
- bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true);
+ bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true);
closure_sync(&cl);
@@ -2351,8 +2357,7 @@ struct btree_split_state {
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
- u64 inline_keys[(BKEY_U64s +
- BKEY_EXTENT_PTRS_MAX) * 3];
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
struct btree_reserve *reserve;
};
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index ec467b1b56d6..835d4bcf166b 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -101,7 +101,7 @@ struct btree {
struct rhash_head hash;
/* Key/pointer for this btree node */
- BKEY_PADDED(key);
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
/* Single bit - set when accessed, cleared by shrinker */
unsigned long accessed;
diff --git a/drivers/md/bcache/buckets.c b/drivers/md/bcache/buckets.c
index 3d92c5db96d3..3c96ce502c10 100644
--- a/drivers/md/bcache/buckets.c
+++ b/drivers/md/bcache/buckets.c
@@ -297,7 +297,7 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b,
struct bkey_s_c_extent e, int sectors,
bool fail_if_stale, bool metadata)
{
- const struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr, *ptr2;
struct cache *ca;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
@@ -350,11 +350,14 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b,
return 0;
stale:
- while (--ptr >= e.v->ptr)
- if ((ca = PTR_CACHE(c, ptr)))
- bch_mark_bucket(c, ca, b, ptr, -sectors,
- bch_extent_ptr_is_dirty(c, e, ptr),
- metadata);
+ extent_for_each_online_device(c, e, ptr2, ca) {
+ if (ptr2 == ptr)
+ break;
+
+ bch_mark_bucket(c, ca, b, ptr2, -sectors,
+ bch_extent_ptr_is_dirty(c, e, ptr2),
+ metadata);
+ }
rcu_read_unlock();
return -1;
diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h
index a2b8e479f80c..5e191ef91812 100644
--- a/drivers/md/bcache/buckets.h
+++ b/drivers/md/bcache/buckets.h
@@ -48,19 +48,20 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c,
const struct bkey_i *k,
unsigned ptr)
{
- const struct cache *ca;
size_t bucket = 0;
-
+#if 0
if (bkey_extent_is_data(&k->k)) {
- const struct bkey_i_extent *e = bkey_i_to_extent_c(k);
- const struct bch_extent_ptr *p = &e->v.ptr[ptr];
+ const struct bch_extent_ptr *ptr;
+ const struct cache *ca;
rcu_read_lock();
- if ((ca = PTR_CACHE(c, p)))
- bucket = PTR_BUCKET_NR(ca, p);
+ extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) {
+ bucket = PTR_BUCKET_NR(ca, ptr);
+ break;
+ }
rcu_read_unlock();
}
-
+#endif
return bucket;
}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index bb4d715c9b15..99d4657c4f4a 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -74,7 +74,7 @@ void bch_btree_verify(struct cache_set *c, struct btree *b)
bio->bi_end_io = btree_verify_endio;
bch_bio_map(bio, n_sorted);
- bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true);
+ bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true);
closure_sync(&cl);
bio_put(bio);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 140b7a9fed3f..4d5889d6d107 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -162,42 +162,104 @@ bool bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
return false;
}
-static bool should_drop_ptr(const struct cache_set *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
+/* returns true if equal */
+static bool crc_cmp(union bch_extent_entry *l, union bch_extent_entry *r)
{
- struct cache *ca;
- struct cache_member *mi;
+ return extent_entry_type(l) == extent_entry_type(r) &&
+ !memcmp(l, r, extent_entry_bytes(l));
+}
- if (ptr->dev == PTR_LOST_DEV)
- return false;
+/* Increment pointers after @crc by crc's offset until the next crc entry: */
+void extent_adjust_pointers(struct bkey_s_extent e, union bch_extent_entry *crc)
+{
+ union bch_extent_entry *entry;
+ unsigned offset = crc_to_64((void *) crc).offset;
- if (ptr->dev >= c->sb.nr_in_set)
- return true;
+ extent_for_each_entry_from(e, entry, extent_entry_next(crc)) {
+ if (!extent_entry_is_ptr(entry))
+ return;
- mi = rcu_dereference(c->members)->m;
+ entry->ptr.offset += offset;
+ }
+}
- if (bch_is_zero(mi[ptr->dev].uuid.b, sizeof(uuid_le)))
- return true;
+static void extent_cleanup_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_entry *crc = e.v->start, *prev = NULL;
- if (bch_extent_ptr_is_dirty(c, e, ptr))
- return false;
+ while (crc != extent_entry_last(e)) {
+ union bch_extent_entry *next = extent_entry_next(crc);
+ size_t crc_u64s = extent_entry_u64s(crc);
- return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+ if (!extent_entry_is_crc(crc))
+ goto next;
+
+ if (next != extent_entry_last(e) &&
+ extent_entry_is_crc(next)) {
+ /*
+ * Two crc entries right after the other, the first one
+ * doesn't have any pointers and we can just drop it:
+ */
+ goto drop;
+ }
+
+ if (prev && crc_cmp(crc, prev)) {
+ /*
+ * This crc entry is identical to the previous one, drop
+ * it:
+ */
+ goto drop;
+ }
+
+ if (!prev &&
+ !crc_to_64((void *) crc).csum_type &&
+ !crc_to_64((void *) crc).compression_type) {
+ extent_adjust_pointers(e, crc);
+ goto drop;
+ }
+
+ prev = crc;
+next:
+ crc = next;
+ continue;
+drop:
+ memmove(crc, next,
+ (void *) extent_entry_last(e) - (void *) next);
+ e.k->u64s -= crc_u64s;
+ }
}
-void bch_extent_drop_stale(struct cache_set *c, struct bkey_s k)
+void bch_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
{
- struct bkey_s_extent e = bkey_s_to_extent(k);
- struct bch_extent_ptr *ptr;
+ __bch_extent_drop_ptr(e, ptr);
+ extent_cleanup_crcs(e);
+}
- rcu_read_lock();
+static bool should_drop_ptr(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ struct cache *ca;
+
+ return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+}
- extent_for_each_ptr_backwards(e, ptr)
- if (should_drop_ptr(c, e.c, ptr))
- bch_extent_drop_ptr(e, ptr);
+void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct bch_extent_ptr *ptr = &e.v->start->ptr;
+ bool dropped = false;
+ rcu_read_lock();
+ while ((ptr = extent_ptr_next(e, ptr)))
+ if (should_drop_ptr(c, e.c, ptr)) {
+ __bch_extent_drop_ptr(e, ptr);
+ dropped = true;
+ } else
+ ptr++;
rcu_read_unlock();
+
+ if (dropped)
+ extent_cleanup_crcs(e);
}
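To make the cleanup cases concrete, an illustrative before/after of an extent value (entry widths not to scale; not taken from the patch):

/*
 *   before:  [crc A][ptr 0][crc B][crc A][ptr 1]
 *   after:   [crc A][ptr 0][ptr 1]
 *
 * - crc B is immediately followed by another crc entry, so it covers no
 *   pointers and is dropped;
 * - the second crc A is identical to the previous (kept) crc entry, so
 *   ptr 1 can share the first one and it is dropped;
 * - separately, a leading crc that carries neither a checksum nor
 *   compression is folded into its pointers by extent_adjust_pointers()
 *   and then dropped.
 */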
static bool bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k)
@@ -207,145 +269,123 @@ static bool bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k)
return bch_extent_normalize(b->c, k);
}
-/*
- * Common among btree pointers and normal data extents
- */
-static bool __ptr_invalid(const struct cache_set *c, struct bkey_s_c k)
+static const char *extent_ptr_invalid(const struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk)
{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- struct cache_member *mi;
- bool ret = true;
+ const struct cache_member *m = mi->m + ptr->dev;
- if (k.k->u64s < BKEY_U64s)
- return true;
+ if (ptr->dev == PTR_LOST_DEV) /* XXX: kill */
+ return NULL;
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX)
- return true;
+ if (ptr->dev > mi->nr_in_set ||
+ bch_is_zero(m->uuid.b, sizeof(uuid_le)))
+ return "pointer to invalid device";
- mi = cache_member_info_get(c)->m;
+ if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets)
+ return "offset past end of device";
- extent_for_each_ptr(e, ptr) {
- struct cache_member *m = mi + ptr->dev;
-
- if (ptr->dev > c->sb.nr_in_set) {
- if (ptr->dev != PTR_LOST_DEV)
- goto invalid;
-
- continue;
- }
+ if (ptr->offset < m->bucket_size * m->first_bucket)
+ return "offset before first bucket";
- if ((ptr->offset + e.k->size >
- m->bucket_size * m->nbuckets) ||
- (ptr->offset <
- m->bucket_size * m->first_bucket) ||
- ((ptr->offset & (m->bucket_size - 1)) + e.k->size >
- m->bucket_size))
- goto invalid;
- }
-
- ret = false;
-invalid:
- cache_member_info_put();
- break;
- default:
- return true;
- }
+ if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size)
+ return "spans multiple buckets";
- return ret;
+ return NULL;
}
-/*
- * Should match __extent_invalid() - returns the reason an extent is invalid
- */
-static const char *bch_ptr_status(const struct cache_set *c,
- struct cache_member *mi,
- struct bkey_s_c_extent e)
+static size_t extent_print_ptrs(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c_extent e)
{
+ char *out = buf, *end = buf + size;
+ const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc64 crc;
+ struct cache *ca;
+ bool first = true;
- if (!bch_extent_ptrs(e))
- return "invalid: no pointers";
-
- if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX)
- return "invalid: too many pointers";
-
- extent_for_each_ptr(e, ptr) {
- struct cache_member *m = mi + ptr->dev;
- struct cache *ca;
-
- if (ptr->dev > c->sb.nr_in_set) {
- if (ptr->dev != PTR_LOST_DEV)
- return "pointer to invalid device";
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
- continue;
+ rcu_read_lock();
+ extent_for_each_entry(e, entry) {
+ if (!first)
+ p(" ");
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = crc_to_64((void *) entry);
+ p("crc: c_size %u size %u offset %u csum %u compress %u",
+ crc.compressed_size, crc.uncompressed_size,
+ crc.offset, crc.csum_type, crc.compression_type);
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ ptr = &entry->ptr;
+ p("ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
+ ? " stale" : "");
+ break;
}
- if (ptr->offset + e.k->size > m->bucket_size * m->nbuckets)
- return "invalid: offset past end of device";
-
- if (ptr->offset < m->bucket_size * m->first_bucket)
- return "invalid: offset before first bucket";
-
- if ((ptr->offset & (m->bucket_size - 1)) +
- e.k->size > m->bucket_size)
- return "invalid: spans multiple buckets";
-
- if ((ca = PTR_CACHE(c, ptr)) &&
- ptr_stale(ca, ptr))
- return "stale";
+ first = false;
}
+ rcu_read_unlock();
- if (!e.k->size)
- return "zeroed key";
- return "";
+ if (bkey_extent_is_cached(e.k))
+ p(" cached");
+#undef p
+ return out - buf;
}
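With the format strings above, a cached extent carrying one crc entry and one pointer renders roughly as follows (all numbers invented for illustration):

	crc: c_size 8 size 16 offset 0 csum 1 compress 1 ptr: 2:34816 gen 5 cached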
-static void bch_extent_to_text(struct cache_set *c, char *buf,
- size_t size, struct bkey_s_c k)
+/* Btree ptrs */
+
+static const char *bch_btree_ptr_invalid_reason(const struct cache_set *c,
+ struct bkey_s_c k)
{
- struct bkey_s_c_extent e;
- char *out = buf, *end = buf + size;
- const struct bch_extent_ptr *ptr;
+ if (bkey_extent_is_cached(k.k))
+ return "cached";
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+ if (k.k->size)
+ return "nonzero key size";
- if (bkey_extent_is_data(k.k)) {
- e = bkey_s_c_to_extent(k);
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
- extent_for_each_ptr(e, ptr) {
- if (ptr != e.v->ptr)
- p(", ");
+ switch (k.k->type) {
+ case BCH_EXTENT: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ const char *reason;
- p("%u:%llu gen %u", ptr->dev,
- (u64) ptr->offset, ptr->gen);
- }
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ reason = extent_ptr_invalid(mi, ptr,
+ CACHE_BTREE_NODE_SIZE(&c->sb));
- if (bkey_extent_is_cached(e.k))
- p(" cached");
-#if 0
- if (KEY_CSUM(k))
- p(" cs%llu %llx", KEY_CSUM(k), k->val[1]);
-#endif
+ if (reason) {
+ cache_member_info_put();
+ return reason;
+ }
+ }
- p(" %s", bch_ptr_status(c, cache_member_info_get(c)->m, e));
cache_member_info_put();
+
+ if (crc)
+ return "has crc field";
+
+ return NULL;
}
-#undef p
-}
-/* Btree ptrs */
+ default:
+ return "invalid value type";
+ }
+}
static bool bch_btree_ptr_invalid(const struct cache_set *c, struct bkey_s_c k)
{
- return bkey_extent_is_cached(k.k) ||
- k.k->size ||
- __ptr_invalid(c, k);
+ return bch_btree_ptr_invalid_reason(c, k);
}
static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
@@ -358,41 +398,49 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
char buf[160];
struct bucket *g;
struct cache *ca;
+ unsigned replicas = 0;
bool bad;
- if (bch_extent_ptrs(e) < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- cache_set_bug(c,
- "btree key bad (too few replicas, %u < %llu): %s",
- bch_extent_ptrs(e),
- CACHE_SET_META_REPLICAS_HAVE(&c->sb),
- buf);
+ if (bkey_extent_is_cached(k.k)) {
+ btree_bug(b, "btree ptr marked as cached");
return;
}
rcu_read_lock();
extent_for_each_online_device(c, e, ptr, ca) {
- g = PTR_BUCKET(ca, ptr);
+ replicas++;
- err = "stale";
- if (ptr_stale(ca, ptr))
- goto err;
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
- do {
- seq = read_seqcount_begin(&c->gc_cur_lock);
- bad = (!__gc_will_visit_node(c, b) &&
- !g->mark.is_metadata);
- } while (read_seqcount_retry(&c->gc_cur_lock, seq));
+ err = "stale";
+ if (ptr_stale(ca, ptr))
+ goto err;
- err = "inconsistent";
- if (bad)
- goto err;
+ do {
+ seq = read_seqcount_begin(&c->gc_cur_lock);
+ bad = (!__gc_will_visit_node(c, b) &&
+ !g->mark.is_metadata);
+ } while (read_seqcount_retry(&c->gc_cur_lock, seq));
+
+ err = "inconsistent";
+ if (bad)
+ goto err;
+ }
}
rcu_read_unlock();
+ if (replicas < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "btree key bad (too few replicas, %u < %llu): %s",
+ replicas, CACHE_SET_META_REPLICAS_HAVE(&c->sb), buf);
+ return;
+ }
+
return;
err:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
@@ -404,16 +452,43 @@ err:
rcu_read_unlock();
}
+static void bch_btree_ptr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_btree_ptr_invalid_reason(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
struct extent_pick_ptr
bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+ union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct cache *ca;
rcu_read_lock();
- extent_for_each_online_device(c, e, ptr, ca) {
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ if (crc) {
+ bch_cache_error(ca,
+ "btree node pointer with crc at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, btree_node_root(b)
+ ? btree_node_root(b)->level : -1,
+ PTR_BUCKET_NR(ca, ptr));
+ break;
+ }
+
if (ptr_stale(ca, ptr)) {
bch_cache_error(ca,
"stale btree node pointer at btree %u level %u/%u bucket %zu",
@@ -440,7 +515,7 @@ const struct btree_keys_ops bch_btree_interior_node_ops = {
const struct bkey_ops bch_bkey_btree_ops = {
.key_invalid = bch_btree_ptr_invalid,
.key_debugcheck = btree_ptr_debugcheck,
- .val_to_text = bch_extent_to_text,
+ .val_to_text = bch_btree_ptr_to_text,
};
/* Extents */
@@ -467,9 +542,24 @@ bool __bch_cut_front(struct bpos where, struct bkey_s k)
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- ptr->offset += e.k->size - len;
+ union bch_extent_crc *crc, *prev_crc = NULL;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ ptr->offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (prev_crc != crc)
+ crc->crc32.offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (prev_crc != crc)
+ crc->crc64.offset += e.k->size - len;
+ break;
+ }
+ prev_crc = crc;
+ }
}
k.k->size = len;
@@ -765,8 +855,8 @@ static void bch_drop_subtract(struct cache_set *c, struct btree *b,
static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
{
struct bkey_s_c_extent le, re;
+ const struct bch_extent_ptr *lp, *rp;
s64 offset;
- unsigned i;
BUG_ON(!l.k->size || !r.k->size);
@@ -803,12 +893,17 @@ static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
* pointer
*/
- if (bch_extent_ptrs(le) != bch_extent_ptrs(re))
+ if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
return false;
- for (i = 0; i < bch_extent_ptrs(le); i++) {
- const struct bch_extent_ptr *lp = le.v->ptr + i;
- const struct bch_extent_ptr *rp = re.v->ptr + i;
+ extent_for_each_ptr(le, lp) {
+ const union bch_extent_entry *entry =
+ bkey_idx(re.v, (u64 *) lp - le.v->_data);
+
+ if (!extent_entry_is_ptr(entry))
+ return false;
+
+ rp = &entry->ptr;
if (lp->offset != rp->offset + offset ||
lp->dev != rp->dev ||
@@ -1211,10 +1306,63 @@ out:
return inserted;
}
+static const char *bch_extent_invalid_reason(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+ return "value too big";
+
+ if (!k.k->size)
+ return "zero key size";
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc64 crc64;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned size_ondisk = e.k->size;
+ const char *reason;
+
+ extent_for_each_entry(e, entry)
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc64 = crc_to_64((void *) entry);
+
+ reason = "checksum uncompressed size < key size";
+ if (crc64.uncompressed_size < e.k->size)
+ goto invalid;
+
+ reason = "checksum offset > uncompressed size";
+ if (crc64.offset >= crc64.uncompressed_size)
+ goto invalid;
+
+ size_ondisk = crc64.compressed_size;
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ reason = extent_ptr_invalid(mi, &entry->ptr, size_ondisk);
+ if (reason)
+ goto invalid;
+ break;
+ }
+
+ cache_member_info_put();
+ return NULL;
+invalid:
+ cache_member_info_put();
+ return reason;
+ }
+
+ default:
+ return "invalid value type";
+ }
+}
+
static bool bch_extent_invalid(const struct cache_set *c, struct bkey_s_c k)
{
- return (bkey_extent_is_data(k.k) && !k.k->size) ||
- __ptr_invalid(c, k);
+ return bch_extent_invalid_reason(c, k);
}
static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
@@ -1229,27 +1377,17 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
char buf[160];
bool bad;
unsigned ptrs_per_tier[CACHE_TIERS];
- unsigned i, tier, replicas;
+ unsigned i, tier, replicas = 0;
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
- if (!bkey_extent_is_cached(e.k) &&
- bch_extent_ptrs(e) < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- cache_set_bug(c,
- "extent key bad (too few replicas, %u < %llu): %s",
- bch_extent_ptrs(e),
- CACHE_SET_DATA_REPLICAS_HAVE(&c->sb),
- buf);
- return;
- }
-
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr) {
bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+ replicas++;
+
/* Could be a special pointer such as PTR_CHECK_DEV */
if (ptr->dev >= mi->nr_in_set) {
if (ptr->dev != PTR_LOST_DEV)
@@ -1299,55 +1437,165 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
goto bad_ptr;
}
}
+ cache_member_info_put();
+
+ if (!bkey_extent_is_cached(e.k) &&
+ replicas < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "extent key bad (too few replicas, %u < %llu): %s",
+ replicas, CACHE_SET_DATA_REPLICAS_HAVE(&c->sb), buf);
+ return;
+ }
- replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
+ /*
+ * XXX: _why_ was this added?
+ */
for (i = 0; i < CACHE_TIERS; i++)
- if (ptrs_per_tier[i] > replicas) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
+ if (ptrs_per_tier[i] > CACHE_SET_DATA_REPLICAS_WANT(&c->sb)) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
cache_set_bug(c,
"extent key bad (too many tier %u replicas): %s",
i, buf);
break;
}
- cache_member_info_put();
return;
bad_device:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- cache_set_bug(c, "extent pointer %u device missing: %s",
- (unsigned) (ptr - e.v->ptr), buf);
+ cache_set_bug(c, "extent pointer to dev %u missing device: %s",
+ ptr->dev, buf);
cache_member_info_put();
return;
bad_ptr:
bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- cache_set_bug(c, "extent pointer %u bad gc mark: %s:\nbucket %zu prio %i "
+ cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
"gen %i last_gc %i mark 0x%08x",
- (unsigned) (ptr - e.v->ptr), buf, PTR_BUCKET_NR(ca, ptr),
+ buf, PTR_BUCKET_NR(ca, ptr),
g->read_prio, PTR_BUCKET_GEN(ca, ptr),
g->oldest_gen, g->mark.counter);
cache_member_info_put();
return;
}
+static void bch_extent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_extent_invalid_reason(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
static unsigned PTR_TIER(struct cache_member_rcu *mi,
- const struct bch_extent *e,
- unsigned ptr)
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->dev < mi->nr_in_set
+ ? CACHE_TIER(&mi->m[ptr->dev])
+ : UINT_MAX;
+}
+
+static void __extent_sort_ptrs(struct cache_member_rcu *mi,
+ struct bkey_s_extent src)
+{
+ struct bch_extent_ptr *src_ptr, *dst_ptr;
+ union bch_extent_entry *src_crc, *dst_crc;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_extent dst;
+ size_t u64s, crc_u64s;
+ u64 *p;
+
+ /*
+ * Insertion sort:
+ *
+ * Note: this sort needs to be stable, because pointer order determines
+ * pointer dirtiness.
+ */
+
+ tmp.k.k = *src.k;
+ dst = bkey_i_to_s_extent(&tmp.k);
+ set_bkey_val_u64s(dst.k, 0);
+
+ extent_for_each_ptr_crc(src, src_ptr, src_crc) {
+ extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
+ if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
+ break;
+
+ /* found insert position: */
+
+ /*
+ * we're making sure everything has a crc at this point, if
+ * dst_ptr points to a pointer it better have a crc:
+ */
+ BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
+ BUG_ON(dst_crc && extent_entry_next(dst_crc) != (void *) dst_ptr);
+
+ p = dst_ptr != &extent_entry_last(dst)->ptr
+ ? (void *) dst_crc
+ : (void *) dst_ptr;
+
+ if (!src_crc)
+ src_crc = (void *) &((struct bch_extent_crc32) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc32,
+ .compressed_size = src.k->size,
+ .uncompressed_size = src.k->size,
+ .offset = 0,
+ .compression_type = BCH_COMPRESSION_NONE,
+ .csum_type = BCH_CSUM_NONE,
+ .csum = 0,
+ });
+
+ crc_u64s = extent_entry_u64s((void *) src_crc);
+ u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
+
+ memmove(p + u64s, p,
+ (void *) extent_entry_last(dst) - (void *) p);
+ set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
+
+ memcpy(p, src_crc, crc_u64s * sizeof(u64));
+ memcpy(p + crc_u64s, src_ptr, sizeof(*src_ptr));
+ }
+
+ /* Sort done - now drop redundant crc entries: */
+ extent_cleanup_crcs(dst);
+
+ memcpy(src.v, dst.v, bkey_val_bytes(dst.k));
+ set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
+}
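An illustrative before/after for the sort (not from the patch), showing that each pointer stays behind the crc entry that describes it:

/*
 *   before:  [crc X][ptr, tier 1][crc Y][ptr, tier 0]
 *   after:   [crc Y][ptr, tier 0][crc X][ptr, tier 1]
 *
 * A pointer that had no crc entry at all gets a synthesized "no checksum,
 * no compression" crc32 during the sort so it stays self-describing, and
 * extent_cleanup_crcs() afterwards drops whatever entries that left
 * redundant.
 */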
+
+static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
{
- unsigned dev = e->ptr[ptr].dev;
+ struct cache_member_rcu *mi;
+ struct bch_extent_ptr *ptr, *prev = NULL;
+ union bch_extent_crc *crc;
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (prev &&
+ PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
+ __extent_sort_ptrs(mi, e);
+ break;
+ }
- return dev < mi->nr_in_set ? CACHE_TIER(&mi->m[dev]) : UINT_MAX;
+ cache_member_info_put();
}
bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
{
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct cache_member_rcu *mi;
- unsigned i;
- bool swapped, have_data = false;
+ bool have_data = false;
switch (k.k->type) {
case KEY_TYPE_ERROR:
@@ -1364,31 +1612,15 @@ bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
case BCH_EXTENT_CACHED:
e = bkey_s_to_extent(k);
-
- bch_extent_drop_stale(c, k);
-
- mi = cache_member_info_get(c);
-
- /* Bubble sort pointers by tier, lowest (fastest) tier first */
- do {
- swapped = false;
- for (i = 0; i + 1 < bch_extent_ptrs(e); i++) {
- if (PTR_TIER(mi, e.v, i) >
- PTR_TIER(mi, e.v, i + 1)) {
- swap(e.v->ptr[i], e.v->ptr[i + 1]);
- swapped = true;
- }
- }
- } while (swapped);
-
- cache_member_info_put();
+ bch_extent_drop_stale(c, e);
+ extent_sort_ptrs(c, e);
extent_for_each_ptr(e, ptr)
if (ptr->dev != PTR_LOST_DEV)
have_data = true;
if (!have_data) {
- bch_set_extent_ptrs(e, 0);
+ set_bkey_val_u64s(e.k, 0);
if (bkey_extent_is_cached(e.k)) {
k.k->type = KEY_TYPE_DISCARD;
if (!k.k->version)
@@ -1417,6 +1649,7 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
struct cache *avoid)
{
struct bkey_s_c_extent e;
+ const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct cache *ca;
struct extent_pick_ptr ret = { .ca = NULL };
@@ -1439,9 +1672,10 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
e = bkey_s_c_to_extent(k);
rcu_read_lock();
- extent_for_each_online_device(c, e, ptr, ca)
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca)
if (!ptr_stale(ca, ptr)) {
ret = (struct extent_pick_ptr) {
+ .crc = crc_to_64(crc),
.ptr = *ptr,
.ca = ca,
};
@@ -1469,7 +1703,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
struct btree *b = container_of(bk, struct btree, keys);
struct cache_set *c = b->c;
struct bkey_s_extent el, er;
- unsigned i;
+ union bch_extent_entry *en_l, *en_r;
if (key_merging_disabled(c))
return BCH_MERGE_NOMERGE;
@@ -1498,11 +1732,20 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
el = bkey_i_to_s_extent(l);
er = bkey_i_to_s_extent(r);
- for (i = 0; i < bch_extent_ptrs(el); i++) {
- struct bch_extent_ptr *lp = el.v->ptr + i;
- struct bch_extent_ptr *rp = er.v->ptr + i;
+ extent_for_each_entry(el, en_l) {
+ struct bch_extent_ptr *lp, *rp;
struct cache_member *m;
+ en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+
+ if ((extent_entry_type(en_l) !=
+ extent_entry_type(en_r)) ||
+ extent_entry_is_crc(en_l))
+ return BCH_MERGE_NOMERGE;
+
+ lp = &en_l->ptr;
+ rp = &en_r->ptr;
+
if (lp->offset + el.k->size != rp->offset ||
lp->dev != rp->dev ||
lp->gen != rp->gen)
@@ -1533,14 +1776,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk,
bch_cut_front(l->k.p, r);
return BCH_MERGE_PARTIAL;
}
-#if 0
- if (KEY_CSUM(l)) {
- if (KEY_CSUM(r))
- l->val[bch_extent_ptrs(l)] = merge_chksums(l, r);
- else
- SET_KEY_CSUM(l, 0);
- }
-#endif
+
bch_key_resize(&l->k, l->k.size + r->k.size);
return BCH_MERGE_MERGE;
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
index ad6bcdf185ad..08c039bd0869 100644
--- a/drivers/md/bcache/extents.h
+++ b/drivers/md/bcache/extents.h
@@ -3,7 +3,10 @@
#include "bkey.h"
+#include <linux/bcache.h>
+
struct bch_replace_info;
+union bch_extent_crc;
struct btree_nr_keys bch_key_sort_fix_overlapping(struct btree_keys *,
struct bset *,
@@ -31,6 +34,7 @@ struct cache_set;
struct journal_res;
struct extent_pick_ptr {
+ struct bch_extent_crc64 crc;
struct bch_extent_ptr ptr;
struct cache *ca;
};
@@ -53,7 +57,7 @@ bool bch_insert_fixup_extent(struct cache_set *, struct btree *,
struct bch_replace_info *, struct bpos *,
struct journal_res *, unsigned);
-void bch_extent_drop_stale(struct cache_set *c, struct bkey_s);
+void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent);
bool bch_extent_normalize(struct cache_set *, struct bkey_s);
static inline bool bkey_extent_is_data(const struct bkey *k)
@@ -80,69 +84,269 @@ static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
}
-#define bch_extent_ptrs(_e) bkey_val_u64s((_e).k)
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-static inline void bch_set_extent_ptrs(struct bkey_s_extent e, unsigned i)
+ return ret;
+}
+
+static inline size_t __extent_entry_bytes(enum bch_extent_entry_type type)
{
- BUG_ON(i > BKEY_EXTENT_PTRS_MAX);
- set_bkey_val_u64s(e.k, i);
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return sizeof(struct bch_extent_crc32);
+ case BCH_EXTENT_ENTRY_crc64:
+ return sizeof(struct bch_extent_crc64);
+ case BCH_EXTENT_ENTRY_ptr:
+ return sizeof(struct bch_extent_ptr);
+ default:
+ BUG();
+ }
}
-static inline void bch_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
+static inline size_t __extent_entry_u64s(enum bch_extent_entry_type type)
{
- BUG_ON(ptr < e.v->ptr ||
- ptr >= e.v->ptr + bch_extent_ptrs(e.c));
+ return __extent_entry_bytes(type) / sizeof(u64);
+}
- memmove(ptr, ptr + 1,
- (void *) (e.v->ptr + bch_extent_ptrs(e.c)) -
- (void *) (ptr + 1));
- e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+static inline size_t extent_entry_bytes(const union bch_extent_entry *e)
+{
+ return __extent_entry_bytes(extent_entry_type(e));
}
-static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
+static inline size_t extent_entry_u64s(const union bch_extent_entry *e)
{
- /* Dirty pointers come last */
+ return extent_entry_bytes(e) / sizeof(u64);
+}
- if (bkey_extent_is_cached(e.k))
- return false;
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
- return ptr + CACHE_SET_DATA_REPLICAS_WANT(&c->sb) >=
- e.v->ptr + bch_extent_ptrs(e);
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ return !extent_entry_is_ptr(e);
}
-#define extent_for_each_ptr(_extent, _ptr) \
- for ((_ptr) = (_extent).v->ptr; \
- (_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent); \
- (_ptr)++)
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+};
-/*
- * Use this when you'll be dropping pointers as you iterate.
- * Any reason we shouldn't just always do this?
- */
-#define extent_for_each_ptr_backwards(_extent, _ptr) \
- for ((_ptr) = (_extent).v->ptr + bch_extent_ptrs(_extent) - 1; \
- (_ptr) >= (_extent).v->ptr; \
- --(_ptr))
+enum bch_extent_crc_type {
+ BCH_EXTENT_CRC_NONE,
+ BCH_EXTENT_CRC32,
+ BCH_EXTENT_CRC64,
+};
+
+static inline enum bch_extent_crc_type
+bch_extent_crc_type(const union bch_extent_crc *crc)
+{
+ if (!crc)
+ return BCH_EXTENT_CRC_NONE;
+
+ switch (extent_entry_type((void *) crc)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return BCH_EXTENT_CRC32;
+ case BCH_EXTENT_ENTRY_crc64:
+ return BCH_EXTENT_CRC64;
+ default:
+ BUG();
+ }
+}
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+#define extent_entry_last(_e) \
+ bkey_idx((_e).v, bkey_val_u64s((_e).k))
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ for ((_entry) = _start; \
+ (_entry) < extent_entry_last(_e); \
+ (_entry) = extent_entry_next(_entry))
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* Iterates through entries until it hits a pointer: */
+#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
+({ \
+ __label__ out; \
+ const union bch_extent_entry *_entry; \
+ \
+ extent_for_each_entry_from(_e, _entry, (void *) _ptr) \
+ if (extent_entry_is_crc(_entry)) { \
+ (_crc) = (void *) _entry; \
+ } else { \
+ _ptr = (typeof(_ptr)) &_entry->ptr; \
+ if (_filter) \
+ goto out; \
+ } \
+ \
+ _ptr = NULL; \
+out: \
+ _ptr; \
+})
-#define __extent_ptr_next_online_device(_c, _extent, _ptr, _ca) \
+#define extent_ptr_next_filter(_e, _ptr, _filter) \
({ \
- (_ca) = NULL; \
+ union bch_extent_crc *_crc; \
\
- while ((_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent) && \
- !((_ca) = PTR_CACHE(_c, _ptr))) \
- (_ptr)++; \
- (_ca); \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
})
-#define extent_for_each_online_device(_c, _extent, _ptr, _ca) \
- for ((_ptr) = (_extent).v->ptr; \
- ((_ca) = __extent_ptr_next_online_device(_c, _extent, \
- _ptr, _ca)); \
+#define extent_ptr_crc_next(_e, _crc, _ptr) \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, true)
+
+#define extent_ptr_next(_e, _ptr) \
+ extent_ptr_next_filter(_e, _ptr, true)
+
+#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
+ for ((_crc) = NULL, \
+ (_ptr) = &(_e).v->start->ptr; \
+ ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
(_ptr)++)
+#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
+ (_ptr)++)
+
+#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, _filter)
+
+#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
+
+#define extent_for_each_ptr_from(_e, _ptr, _start) \
+ extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
+
+#define extent_for_each_ptr(_e, _ptr) \
+ extent_for_each_ptr_filter(_e, _ptr, true)
+
+#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
+ extent_for_each_ptr_filter(_e, _ptr, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_ptr_prev(_e, _ptr) \
+({ \
+ typeof(&(_e).v->start->ptr) _p; \
+ typeof(&(_e).v->start->ptr) _prev = NULL; \
+ \
+ extent_for_each_ptr(_e, _p) { \
+ if (_p == (_ptr)) \
+ break; \
+ _prev = _p; \
+ } \
+ \
+ _prev; \
+})
+
+/*
+ * Use this when you'll be dropping pointers as you iterate. Quadratic,
+ * unfortunately:
+ */
+#define extent_for_each_ptr_backwards(_e, _ptr) \
+ for ((_ptr) = extent_ptr_prev(_e, NULL); \
+ (_ptr); \
+ (_ptr) = extent_ptr_prev(_e, _ptr))
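A hypothetical helper (not in the patch) showing how the crc-aware iterator is meant to be used -- each pointer is visited together with the crc entry currently governing it, which may be NULL (crc_to_64() is defined a little further down):

static unsigned nr_checksummed_ptrs(struct bkey_s_c_extent e)
{
	const union bch_extent_crc *crc;
	const struct bch_extent_ptr *ptr;
	unsigned nr = 0;

	extent_for_each_ptr_crc(e, ptr, crc)
		if (crc && crc_to_64(crc).csum_type != BCH_CSUM_NONE)
			nr++;

	return nr;
}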
+
+/*
+ * make sure the type field gets set correctly:
+ */
+#define __extent_entry_append(_e, _type, _val) \
+do { \
+ union bch_extent_entry *_new = \
+ extent_entry_last(extent_i_to_s((_e))); \
+ \
+ (_e)->k.u64s += __extent_entry_u64s(BCH_EXTENT_ENTRY_##_type); \
+ BUG_ON(bkey_val_u64s(&(_e)->k) > BKEY_EXTENT_VAL_U64s_MAX); \
+ \
+ _new->_type = _val; \
+ _new->_type.type = 1 << BCH_EXTENT_ENTRY_##_type; \
+ \
+ BUG_ON(extent_entry_type(_new) != BCH_EXTENT_ENTRY_##_type); \
+} while (0)
+
+static inline void extent_crc32_append(struct bkey_i_extent *e,
+ struct bch_extent_crc32 crc)
+{
+ __extent_entry_append(e, crc32, crc);
+}
+
+static inline void extent_crc64_append(struct bkey_i_extent *e,
+ struct bch_extent_crc64 crc)
+{
+ __extent_entry_append(e, crc64, crc);
+}
+
+static inline void extent_ptr_append(struct bkey_i_extent *e,
+ struct bch_extent_ptr ptr)
+{
+ __extent_entry_append(e, ptr, ptr);
+}
+
+/* XXX: inefficient */
+static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ const struct bch_extent_ptr *i;
+ unsigned seen = 0;
+
+ if (bkey_extent_is_cached(e.k))
+ return false;
+
+ /* Dirty pointers come last */
+ extent_for_each_ptr_from(e, i, ptr)
+ seen++;
+
+ return seen <= CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
+}
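A worked example of the "dirty pointers come last" rule (values invented): with CACHE_SET_DATA_REPLICAS_WANT == 2 and an uncached extent whose pointers are [p0, p1, p2], the walk from p2 sees 1 pointer and the walk from p1 sees 2, so p1 and p2 are dirty; the walk from p0 sees 3 > 2, so p0 is a clean extra copy.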
+
+static inline struct bch_extent_crc64 crc_to_64(const union bch_extent_crc *crc)
+{
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return (struct bch_extent_crc64) { 0 };
+ case BCH_EXTENT_CRC32:
+ return (struct bch_extent_crc64) {
+ .compressed_size = crc->crc32.compressed_size,
+ .uncompressed_size = crc->crc32.uncompressed_size,
+ .offset = crc->crc32.offset,
+ .csum_type = crc->crc32.csum_type,
+ .compression_type = crc->crc32.compression_type,
+ .csum = crc->crc32.csum,
+ };
+ case BCH_EXTENT_CRC64:
+ return crc->crc64;
+ default:
+ BUG();
+ }
+}
+
+void extent_adjust_pointers(struct bkey_s_extent, union bch_extent_entry *);
+
+/* Doesn't cleanup redundant crcs */
+static inline void __bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ memmove(ptr, ptr + 1, (void *) extent_entry_last(e) - (void *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
bool bch_extent_has_device(struct bkey_s_c_extent, unsigned);
bool bch_cut_front(struct bpos, struct bkey_i *);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index d543344be4ab..c0d17ad94623 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -21,6 +21,7 @@
#include "super.h"
#include <linux/blkdev.h>
+#include <linux/zlib.h>
#include <trace/events/bcache.h>
@@ -61,29 +62,68 @@ void bch_bio_submit_work(struct work_struct *work)
}
}
-/* Bios with headers */
+/* Allocate, free from mempool: */
-void bch_bbio_prep(struct bbio *b, struct cache *ca)
+void bch_bio_free_pages_pool(struct cache_set *c, struct bio *bio)
{
- struct bvec_iter *iter = &b->bio.bi_iter;
+ struct bio_vec *bv;
+ unsigned i;
+
+ bio_for_each_segment_all(bv, bio, i)
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+}
+
+static void bch_bio_alloc_page_pool(struct cache_set *c, struct bio *bio,
+ bool *using_mempool)
+{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+
+ if (likely(!*using_mempool)) {
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bv->bv_page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+}
+
+static void bch_bio_alloc_pages_pool(struct cache_set *c, struct bio *bio,
+ size_t bytes)
+{
+ bool using_mempool = false;
+
+ bio->bi_iter.bi_size = bytes;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
+ bch_bio_alloc_page_pool(c, bio, &using_mempool);
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
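A note on the design (inferred from the code above, not stated in the patch): the mempool is only drawn from while bio_bounce_pages_lock is held, so at most one bio uses the emergency reserve at a time and the pool's minimum size is enough for forward progress when alloc_page(GFP_NOIO) fails; pages obtained here go back via bch_bio_free_pages_pool(), e.g. from the write endio path when the bio was bounced.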
+
+/* Bios with headers */
+static void bch_bbio_prep(struct bbio *b, struct cache *ca)
+{
b->ca = ca;
b->bio.bi_iter.bi_sector = b->ptr.offset;
b->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
-
- b->bi_idx = iter->bi_idx;
- b->bi_bvec_done = iter->bi_bvec_done;
}
-/* XXX: should be bkey, not bkey_i */
-void bch_submit_bbio(struct bbio *b, struct cache *ca, const struct bkey_i *k,
+void bch_submit_bbio(struct bbio *b, struct cache *ca,
const struct bch_extent_ptr *ptr, bool punt)
{
struct bio *bio = &b->bio;
- b->key = *k;
b->ptr = *ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&b->key), 1);
bch_bbio_prep(b, ca);
b->submit_time_us = local_clock_us();
@@ -100,27 +140,28 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c,
bool punt)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
struct cache *ca;
- unsigned ptr;
+ unsigned ptr_idx = 0;
BUG_ON(bio->orig);
- for (ptr = ptrs_from;
- ptr < bch_extent_ptrs(e);
- ptr++) {
+ extent_for_each_ptr(e, ptr) {
+ if (ptr_idx++ < ptrs_from)
+ continue;
+
rcu_read_lock();
- ca = PTR_CACHE(c, &e.v->ptr[ptr]);
+ ca = PTR_CACHE(c, ptr);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
if (!ca) {
- bch_submit_bbio(&bio->bio, ca, k,
- &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&bio->bio, ca, ptr, punt);
break;
}
- if (ptr + 1 < bch_extent_ptrs(e)) {
+ if (ptr + 1 < &extent_entry_last(e)->ptr) {
struct bch_write_bio *n =
to_wbio(bio_clone_fast(&bio->bio.bio, GFP_NOIO,
&ca->replica_set));
@@ -130,16 +171,17 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c,
n->orig = &bio->bio.bio;
__bio_inc_remaining(n->orig);
- bch_submit_bbio(&n->bio, ca, k, &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&n->bio, ca, ptr, punt);
} else {
- bch_submit_bbio(&bio->bio, ca, k,
- &e.v->ptr[ptr], punt);
+ bch_submit_bbio(&bio->bio, ca, ptr, punt);
}
}
}
static void bch_bbio_reset(struct bbio *b)
{
+ BUG();
+#if 0
struct bvec_iter *iter = &b->bio.bi_iter;
bio_reset(&b->bio);
@@ -147,6 +189,7 @@ static void bch_bbio_reset(struct bbio *b)
iter->bi_size = b->key.k.size << 9;
iter->bi_idx = b->bi_idx;
iter->bi_bvec_done = b->bi_bvec_done;
+#endif
}
/* IO errors */
@@ -268,25 +311,198 @@ static inline bool version_stress_test(struct cache_set *c)
#endif
}
-static void __bch_write(struct closure *);
-
-#if 0
-static void bio_csum(struct bio *bio, struct bkey *k)
+static u32 checksum_bio(struct bio *bio, unsigned type)
{
struct bio_vec bv;
struct bvec_iter iter;
- u64 crc = 0xffffffffffffffffULL;
+ u32 csum = U32_MAX;
+
+ if (type == BCH_CSUM_NONE)
+ return 0;
bio_for_each_segment(bv, bio, iter) {
- void *d = kmap(bv.bv_page) + bv.bv_offset;
+ void *p = kmap_atomic(bv.bv_page);
- crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
- kunmap(bv.bv_page);
+ csum = bch_checksum_update(type, csum,
+ p + bv.bv_offset,
+ bv.bv_len);
+ kunmap_atomic(p);
}
- k->val[bch_extent_ptrs(k)] = crc;
+ return csum ^= U32_MAX;
}
-#endif
+
+static int bio_compress_gzip(struct cache_set *c, struct bio *dst,
+ struct bio *src, unsigned output_available)
+{
+ struct bvec_iter src_iter = src->bi_iter;
+ z_stream strm;
+ struct page *workspace;
+ struct page *inp = NULL;
+ void *k_in = NULL;
+ bool using_mempool = false;
+ int ret;
+
+ BUG_ON(dst->bi_iter.bi_size);
+
+ workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO);
+ strm.workspace = page_address(workspace);
+
+ zlib_deflateInit(&strm, 3);
+ strm.next_in = NULL;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+ strm.avail_in = 0;
+
+ while (1) {
+ if (!strm.avail_out) {
+ struct bio_vec *bv = &dst->bi_io_vec[dst->bi_vcnt];
+
+ if (!output_available) {
+ /*
+ * XXX: this really shouldn't happen, accounting
+ * is screwed up somehow:
+ */
+ //pr_err("output_available == 0");
+ goto err;
+ }
+
+ BUG_ON(dst->bi_vcnt >= dst->bi_max_vecs);
+
+ if (k_in) {
+ kunmap_atomic(k_in);
+
+ bch_bio_alloc_page_pool(c, dst, &using_mempool);
+
+ strm.next_in = kmap_atomic(inp) +
+ (((unsigned long) strm.next_in) &
+ (PAGE_SIZE - 1));
+ } else {
+ bch_bio_alloc_page_pool(c, dst, &using_mempool);
+ }
+
+ strm.next_out = page_address(bv->bv_page);
+ strm.avail_out = min_t(unsigned, PAGE_SIZE,
+ output_available);
+
+ dst->bi_iter.bi_size += strm.avail_out;
+ output_available -= strm.avail_out;
+ }
+
+ if (!strm.avail_in && src_iter.bi_size &&
+ output_available > PAGE_SIZE * 3 / 2) {
+ struct bio_vec bv = bio_iter_iovec(src, src_iter);
+
+ if (k_in)
+ kunmap_atomic(k_in);
+
+ strm.avail_in = bv.bv_len;
+ inp = bv.bv_page;
+ k_in = kmap_atomic(inp);
+ strm.next_in = k_in + bv.bv_offset;
+
+ bio_advance_iter(src, &src_iter, strm.avail_in);
+ }
+
+ ret = zlib_deflate(&strm, strm.avail_in
+ ? Z_NO_FLUSH : Z_FINISH);
+ if (ret == Z_STREAM_END)
+ break;
+
+ BUG_ON(ret != Z_OK);
+ }
+
+ ret = zlib_deflateEnd(&strm);
+ BUG_ON(ret != Z_OK);
+
+ BUG_ON(strm.total_out > dst->bi_iter.bi_size);
+
+ /* caller will pad with 0s to block boundary */
+ dst->bi_iter.bi_size = strm.total_out;
+
+ /* return number of bytes consumed */
+ ret = src->bi_iter.bi_size - src_iter.bi_size;
+out:
+ if (k_in)
+ kunmap_atomic(k_in);
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+ mempool_free(workspace, &c->compression_workspace_pool);
+
+ return ret;
+err:
+ ret = -1;
+ goto out;
+}
+
+static unsigned bio_compress(struct cache_set *c, struct bio *dst,
+ struct bio *src, unsigned *compression_type,
+ unsigned output_available)
+{
+ int ret = 0;
+
+ /* if it's only one block, don't bother trying to compress: */
+ if (bio_sectors(src) <= c->sb.block_size)
+ *compression_type = BCH_COMPRESSION_NONE;
+
+ switch (*compression_type) {
+ case BCH_COMPRESSION_NONE:
+ /* Just bounce it, for stable checksums: */
+copy:
+ bch_bio_alloc_pages_pool(c, dst, output_available);
+ bio_copy_data(dst, src);
+ return output_available;
+ case BCH_COMPRESSION_LZO1X:
+ BUG();
+ case BCH_COMPRESSION_GZIP:
+ ret = bio_compress_gzip(c, dst, src, output_available);
+ break;
+ case BCH_COMPRESSION_XZ:
+ BUG();
+ default:
+ BUG();
+ }
+
+ if (ret < 0) {
+ /* Failed to compress (didn't get smaller): */
+ *compression_type = BCH_COMPRESSION_NONE;
+ goto copy;
+ }
+
+ BUG_ON(ret & ((1 << (c->block_bits + 9)) - 1));
+
+ if (DIV_ROUND_UP(dst->bi_iter.bi_size, block_bytes(c)) >=
+ ret >> (c->block_bits + 9)) {
+ /* Failed to compress (didn't get smaller): */
+ *compression_type = BCH_COMPRESSION_NONE;
+ goto copy;
+ }
+
+ /* Pad to blocksize, and zero out padding: */
+ while (dst->bi_iter.bi_size & (block_bytes(c) - 1)) {
+ unsigned idx = dst->bi_iter.bi_size >> PAGE_SHIFT;
+ unsigned offset = dst->bi_iter.bi_size & (PAGE_SIZE - 1);
+ unsigned bytes = (PAGE_SIZE - offset) & (block_bytes(c) - 1);
+
+ if (idx < dst->bi_vcnt) {
+ struct bio_vec *bv = &dst->bi_io_vec[idx];
+
+ memset(page_address(bv->bv_page) + offset, 0, bytes);
+ } else {
+ dst->bi_io_vec[dst->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = ZERO_PAGE(0),
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0,
+ };
+ }
+
+ dst->bi_iter.bi_size += bytes;
+ }
+
+ return ret;
+}
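A standalone model of the "did compression actually win a block?" test above, assuming 4 KiB blocks (block size and numbers invented for illustration):

#include <stdio.h>

#define BLOCK_BYTES		4096u
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned consumed = 8 * BLOCK_BYTES;	/* input bytes fed to the compressor */
	unsigned out	  = 22100;		/* compressed output bytes */

	unsigned in_blocks  = consumed / BLOCK_BYTES;
	unsigned out_blocks = DIV_ROUND_UP(out, BLOCK_BYTES);

	if (out_blocks >= in_blocks)
		printf("no win: fall back to the bounce-and-copy path\n");
	else
		printf("keep compressed data, pad %u -> %u bytes with zeroes\n",
		       out, out_blocks * BLOCK_BYTES);
	return 0;
}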
+
+static void __bch_write(struct closure *);
static void bch_write_done(struct closure *cl)
{
@@ -302,6 +518,11 @@ static void bch_write_done(struct closure *cl)
if (!op->write_done)
continue_at(cl, __bch_write, op->io_wq);
+ if (op->replace_collision) {
+ trace_bcache_promote_collision(&op->replace_info.key.k);
+ atomic_inc(&op->c->accounting.collector.cache_miss_collisions);
+ }
+
percpu_ref_put(&op->c->writes);
bch_keylist_free(&op->insert_keys);
closure_return(cl);
@@ -381,6 +602,7 @@ static void bch_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
if (bio->bi_error) {
/* TODO: We could try to recover from this. */
@@ -393,14 +615,277 @@ static void bch_write_endio(struct bio *bio)
set_closure_fn(cl, NULL, NULL);
}
- bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
+ if (wbio->orig)
+ bio_endio(wbio->orig);
+ else if (wbio->bounce)
+ bch_bio_free_pages_pool(op->c, bio);
+
+ bch_bbio_endio(&wbio->bio, bio->bi_error, "writing data to cache");
+}
+
+static const unsigned bch_crc_size[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64] = 8,
+};
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ *
+ * XXX: to guard against data being corrupted while in memory, instead of
+ * recomputing the checksum here, it would be better to have the read path,
+ * instead of computing the checksum of the entire extent:
+ *
+ * | extent |
+ *
+ * compute the checksums of the live and dead data separately
+ * | dead data || live data || dead data |
+ *
+ * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
+ * use crc_live here (that we verified was correct earlier)
+ */
+static void extent_cleanup_checksums(struct bkey_s_extent e,
+ u64 csum, unsigned csum_type)
+{
+ union bch_extent_entry *entry;
+
+ extent_for_each_entry(e, entry)
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ continue;
+ case BCH_EXTENT_ENTRY_crc32:
+ if (entry->crc32.compression_type != BCH_COMPRESSION_NONE ||
+ bch_crc_size[csum_type] > sizeof(entry->crc32.csum))
+ continue;
+
+ extent_adjust_pointers(e, entry);
+ entry->crc32.compressed_size = e.k->size;
+ entry->crc32.uncompressed_size = e.k->size;
+ entry->crc32.offset = 0;
+ entry->crc32.csum_type = csum_type;
+ entry->crc32.csum = csum;
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ if (entry->crc64.compression_type != BCH_COMPRESSION_NONE ||
+ bch_crc_size[csum_type] > sizeof(entry->crc64.csum))
+ continue;
+
+ extent_adjust_pointers(e, entry);
+ entry->crc64.compressed_size = e.k->size;
+ entry->crc64.uncompressed_size = e.k->size;
+ entry->crc64.offset = 0;
+ entry->crc64.csum_type = csum_type;
+ entry->crc64.csum = csum;
+ break;
+ }
+}
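
The XXX note above extent_cleanup_checksums() describes verifying the live
region's checksum by checksumming the dead and live regions separately and
checking them against the original whole-extent checksum. A minimal sketch of
that idea, assuming a flat buffer and the kernel's crc32c() (a running CRC
continued from a seed, so the '+' in the note becomes CRC continuation rather
than literal addition), and assuming the stored checksum was seeded with 0; the
helper below is illustrative only and not part of this patch:

#include <linux/types.h>
#include <linux/crc32c.h>

/*
 * Hypothetical helper: @buf holds the entire originally written extent,
 * laid out as [ dead1 | live | dead2 ].  Returns true, and the checksum of
 * just the live region, only if the whole-extent checksum still matches.
 */
static bool crc32c_verify_live(const u8 *buf, size_t dead1, size_t live,
			       size_t dead2, u32 orig_csum, u32 *live_csum)
{
	u32 crc_live = crc32c(0, buf + dead1, live);
	u32 crc      = crc32c(0, buf, dead1);

	/* continue the whole-extent CRC across the live and trailing regions */
	crc = crc32c(crc, buf + dead1, live);
	crc = crc32c(crc, buf + dead1 + live, dead2);

	if (crc != orig_csum)
		return false;	/* extent was corrupted; trust nothing */

	*live_csum = crc_live;	/* covers only the currently live data */
	return true;
}
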
+
+static void extent_checksum_append(struct bkey_i_extent *e,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ struct bch_extent_ptr *ptr;
+ union bch_extent_crc *crc;
+
+ BUG_ON(compressed_size > uncompressed_size);
+ BUG_ON(uncompressed_size != e->k.size);
+
+ /*
+ * Look up the last crc entry, so we can check if we need to add
+ * another:
+ */
+ extent_for_each_ptr_crc(extent_i_to_s(e), ptr, crc)
+ ;
+
+ switch (bch_extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ if (csum_type == BCH_CSUM_NONE &&
+ compression_type == BCH_COMPRESSION_NONE)
+ return;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (crc->crc32.compressed_size == compressed_size &&
+ crc->crc32.uncompressed_size == uncompressed_size &&
+ crc->crc32.offset == 0 &&
+ crc->crc32.compression_type == compression_type &&
+ crc->crc32.csum_type == csum_type &&
+ crc->crc32.csum == csum)
+ return;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (crc->crc64.compressed_size == compressed_size &&
+ crc->crc64.uncompressed_size == uncompressed_size &&
+ crc->crc64.offset == 0 &&
+		    crc->crc64.compression_type == compression_type &&
+ crc->crc64.csum_type == csum_type &&
+ crc->crc64.csum == csum)
+ return;
+ break;
+ }
+
+ switch (csum_type) {
+ case BCH_CSUM_NONE:
+ case BCH_CSUM_CRC32C:
+ BUG_ON(compressed_size > CRC32_EXTENT_SIZE_MAX ||
+ uncompressed_size > CRC32_EXTENT_SIZE_MAX);
+
+ extent_crc32_append(e, (struct bch_extent_crc32) {
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ });
+ break;
+ case BCH_CSUM_CRC64:
+ BUG_ON(compressed_size > CRC64_EXTENT_SIZE_MAX ||
+ uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+
+ extent_crc64_append(e, (struct bch_extent_crc64) {
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ });
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch_write_extent(struct bch_write_op *op,
+ struct open_bucket *ob,
+ struct bkey_i *k, struct bio *orig)
+{
+ struct cache_set *c = op->c;
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ struct bkey_i_extent *e = bkey_i_to_extent(k);
+ struct bch_extent_ptr *ptr;
+ unsigned ptrs_from = 0;
+ unsigned csum_type = CACHE_DATA_PREFERRED_CSUM_TYPE(&c->sb);
+ unsigned compression_type = CACHE_COMPRESSION_TYPE(&c->sb);
+
+ /* don't refetch csum type/compression type */
+ barrier();
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptrs_from++;
+
+ if (csum_type != BCH_CSUM_NONE ||
+ compression_type != BCH_COMPRESSION_NONE) {
+ /* all units here in bytes */
+ unsigned output_available, input_available, input_consumed;
+ u64 csum;
+
+ BUG_ON(bio_sectors(orig) != k->k.size);
+
+ /* XXX: decide extent size better: */
+ output_available = min(k->k.size,
+ min(ob->sectors_free,
+ CRC32_EXTENT_SIZE_MAX)) << 9;
+
+ input_available = min(orig->bi_iter.bi_size,
+ CRC32_EXTENT_SIZE_MAX << 9);
+
+ /*
+ * temporarily set input bio's size to the max we want to
+ * consume from it, in order to avoid overflow in the crc info
+ */
+ swap(orig->bi_iter.bi_size, input_available);
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(output_available, PAGE_SIZE),
+ &c->bio_write);
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = true;
+
+ input_consumed = bio_compress(c, bio, orig,
+ &compression_type,
+ output_available);
+
+ swap(orig->bi_iter.bi_size, input_available);
+
+ bch_key_resize(&k->k, input_consumed >> 9);
+ bio_advance(orig, input_consumed);
+
+ /*
+ * XXX: could move checksumming out from under the open
+ * bucket lock - but compression is also being done
+ * under it
+ */
+ csum = checksum_bio(bio, csum_type);
+
+ /*
+ * If possible, adjust existing pointers to only point to
+ * currently live data, while we have the checksum for that
+ * data:
+ */
+ extent_cleanup_checksums(extent_i_to_s(e), csum, csum_type);
+
+ /*
+ * Add a bch_extent_crc header for the pointers that
+ * bch_alloc_sectors_done() is going to append:
+ */
+ extent_checksum_append(e, bio_sectors(bio), e->k.size,
+ compression_type,
+ csum, csum_type);
+
+ bch_alloc_sectors_done(op->c, op->wp, k, ob, bio_sectors(bio));
+ } else {
+ if (k->k.size > ob->sectors_free)
+ bch_key_resize(&k->k, ob->sectors_free);
+
+ /*
+ * We might need a checksum entry, if there's a previous
+ * checksum entry we need to override:
+ */
+ extent_checksum_append(e, k->k.size, k->k.size,
+ compression_type, 0, csum_type);
+
+ bch_alloc_sectors_done(op->c, op->wp, k, ob, k->k.size);
+
+ bio = bio_next_split(orig, k->k.size, GFP_NOIO,
+ &op->c->bio_write);
+ if (bio == orig)
+ bio_get(bio);
+
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ }
+
+ bio->bi_end_io = bch_write_endio;
+ bio->bi_private = &op->cl;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+#ifndef CONFIG_BCACHE_NO_IO
+ bch_submit_bbio_replicas(wbio, op->c, k, ptrs_from, false);
+#else
+ ptrs_from = ptrs_from;
+ bch_bbio_prep(&wbio->bio, NULL);
+ closure_get(bio->bi_private);
+ bio_endio(bio);
+#endif
}
static void __bch_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio.bio, *n;
- unsigned open_bucket_nr = 0, ptrs_from;
+ struct bio *bio = &op->bio->bio.bio;
+ unsigned open_bucket_nr = 0;
struct open_bucket *b;
memset(op->open_buckets, 0, sizeof(op->open_buckets));
@@ -412,8 +897,9 @@ static void __bch_write(struct closure *cl)
continue_at(cl, bch_write_done, op->c->wq);
}
- bch_extent_drop_stale(op->c, bkey_i_to_s(&op->insert_key));
- ptrs_from = bch_extent_ptrs(bkey_i_to_s_extent(&op->insert_key));
+ if (bkey_extent_is_data(&op->insert_key.k))
+ bch_extent_drop_stale(op->c,
+ bkey_i_to_s_extent(&op->insert_key));
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
@@ -438,9 +924,9 @@ static void __bch_write(struct closure *cl)
k = op->insert_keys.top;
bkey_copy(k, &op->insert_key);
- b = bch_alloc_sectors(op->c, op->wp, k,
- op->check_enospc,
- op->nowait ? NULL : cl);
+ b = bch_alloc_sectors_start(op->c, op->wp,
+ op->check_enospc,
+ op->nowait ? NULL : cl);
BUG_ON(!b);
if (PTR_ERR(b) == -EAGAIN) {
@@ -458,30 +944,15 @@ static void __bch_write(struct closure *cl)
op->open_buckets[open_bucket_nr++] = b;
+ /*
+ * XXX: if we compressed, we didn't use all the space we just
+ * allocated
+ */
+ bch_write_extent(op, b, k, bio);
bch_cut_front(k->k.p, &op->insert_key);
- n = bio_next_split(bio, k->k.size, GFP_NOIO,
- &op->c->bio_write);
- if (n == bio)
- bio_get(bio);
-
- n->bi_end_io = bch_write_endio;
- n->bi_private = cl;
-#if 0
- if (KEY_CSUM(k))
- bio_csum(n, k);
-#endif
- trace_bcache_cache_insert(&k->k);
-
- bio_set_op_attrs(n, REQ_OP_WRITE, 0);
-#ifndef CONFIG_BCACHE_NO_IO
- bch_submit_bbio_replicas(to_wbio(n), op->c, k,
- ptrs_from, false);
-#else
- bch_bbio_prep(to_bbio(n), NULL);
- closure_get(n->bi_private);
- bio_endio(n);
-#endif
+ BUG_ON(op->insert_key.k.size &&
+ op->insert_key.k.size != bio_sectors(bio));
BUG_ON(bch_extent_normalize(op->c, bkey_i_to_s(k)));
bch_check_mark_super(op->c, k, false);
@@ -489,7 +960,9 @@ static void __bch_write(struct closure *cl)
bkey_extent_set_cached(&k->k, op->cached);
bch_keylist_enqueue(&op->insert_keys);
- } while (n != bio);
+
+ trace_bcache_cache_insert(&k->k);
+ } while (op->insert_key.k.size);
op->write_done = true;
continue_at(cl, bch_write_index, op->c->wq);
@@ -775,69 +1248,17 @@ struct cache_promote_op {
struct closure cl;
struct bio *orig_bio;
struct bch_write_op iop;
- bool stale; /* was the ptr stale after the read? */
struct bch_write_bio bio; /* must be last */
};
static void cache_promote_done(struct closure *cl)
{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct cache_set *c = op->iop.c;
-
- if (op->iop.replace_collision) {
- trace_bcache_promote_collision(&op->iop.replace_info.key.k);
- atomic_inc(&c->accounting.collector.cache_miss_collisions);
- }
-
- bch_bio_free_pages(&op->iop.bio->bio.bio);
- kfree(op);
-}
-
-static void cache_promote_write(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct bio *bio = &op->iop.bio->bio.bio;
-
- bio_reset(bio);
- bio->bi_iter.bi_sector = bkey_start_offset(&op->iop.insert_key.k);
- bio->bi_iter.bi_size = op->iop.insert_key.k.size << 9;
- /* needed to reinit bi_vcnt so pages can be freed later */
- bch_bio_map(bio, NULL);
-
- bio_copy_data(op->orig_bio, bio);
- op->orig_bio->bi_error = op->iop.error;
- bio_endio(op->orig_bio);
-
- if (!op->stale &&
- !op->iop.error &&
- !test_bit(CACHE_SET_RO, &op->iop.c->flags) &&
- !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
- closure_call(&op->iop.cl, bch_write, NULL, cl);
-
- closure_return_with_destructor(cl, cache_promote_done);
-}
-
-static void cache_promote_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
struct cache_promote_op *op =
- container_of(bio, struct cache_promote_op, bio.bio.bio);
+ container_of(cl, struct cache_promote_op, cl);
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
-
- if (bio->bi_error)
- op->iop.error = bio->bi_error;
- else if (b->ca && ptr_stale(b->ca, &b->ptr))
- op->stale = 1;
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
+ bch_bio_free_pages_pool(op->iop.c, op->orig_bio);
+ bio_put(op->orig_bio);
+ kfree(op);
}
/**
@@ -852,6 +1273,7 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
struct bkey_s_c new,
unsigned write_flags)
{
+#if 0
struct cache_promote_op *op;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
@@ -886,10 +1308,12 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
op->stale = 0;
bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
- new, old, BCH_WRITE_CHECK_ENOSPC|write_flags);
+ new, old,
+ BCH_WRITE_CHECK_ENOSPC|
+ BCH_WRITE_ALLOC_NOWAIT|write_flags);
- bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
- bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
+ //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
+ //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
trace_bcache_promote(&orig_bio->bio);
@@ -901,24 +1325,7 @@ out_free:
kfree(op);
out_submit:
generic_make_request(&orig_bio->bio);
-}
-
-/**
- * cache_promote - promote data stored in higher tiers
- *
- * Used for flash only volumes.
- *
- * @bio must actually be a bbio with valid key.
- */
-bool cache_promote(struct cache_set *c, struct bbio *bio, struct bkey_s_c k)
-{
- if (!CACHE_TIER(&bio->ca->mi)) {
- generic_make_request(&bio->bio);
- return 0;
- }
-
- __cache_promote(c, bio, k, k, BCH_WRITE_ALLOC_NOWAIT);
- return 1;
+#endif
}
/* Read */
@@ -927,36 +1334,325 @@ static void bch_read_requeue(struct cache_set *c, struct bio *bio)
{
unsigned long flags;
+ BUG();
+
spin_lock_irqsave(&c->read_race_lock, flags);
bio_list_add(&c->read_race_list, bio);
spin_unlock_irqrestore(&c->read_race_lock, flags);
queue_work(c->wq, &c->read_race_work);
}
-static void bch_read_endio(struct bio *bio)
+static int bio_uncompress_gzip(struct cache_set *c,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bio *src, struct bvec_iter src_iter,
+ unsigned skip)
{
- struct bbio *b = to_bbio(bio);
- struct cache *ca = b->ca;
- struct bio *orig = bio->bi_private;
-
- bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
-
- if (!bio->bi_error && ca &&
- (race_fault() ||
- ptr_stale(ca, &b->ptr))) {
- /* Read bucket invalidate race */
- atomic_long_inc(&ca->set->cache_read_races);
- bch_read_requeue(ca->set, bio);
+ z_stream strm;
+ struct page *workspace;
+ void *k_out = NULL;
+ u8 garbage[128];
+ int ret;
+ bool decompress_all = true;
+
+ workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO);
+ strm.workspace = page_address(workspace);
+
+ zlib_inflateInit(&strm);
+ strm.next_in = NULL;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+ strm.avail_in = 0;
+
+ do {
+ if (strm.avail_out) {
+ ;
+ } else if (skip) {
+ strm.avail_out = min_t(unsigned, sizeof(garbage), skip);
+ strm.next_out = garbage;
+
+ skip -= strm.avail_out;
+ } else if (dst_iter.bi_size) {
+ struct bio_vec bv = bio_iter_iovec(dst, dst_iter);
+
+ if (k_out)
+ kunmap_atomic(k_out);
+ k_out = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+ strm.avail_out = bv.bv_len;
+ strm.next_out = k_out;
+
+ bio_advance_iter(dst, &dst_iter, bv.bv_len);
+ } else {
+ /* Uncompressed all the data we actually want: */
+ if (!decompress_all) {
+ ret = Z_STREAM_END;
+ break;
+ }
+
+ strm.avail_out = sizeof(garbage);
+ strm.next_out = garbage;
+ }
+
+ if (!strm.avail_in && src_iter.bi_size) {
+ struct bio_vec bv = bio_iter_iovec(src, src_iter);
+
+ strm.avail_in = bv.bv_len;
+ strm.next_in = page_address(bv.bv_page) + bv.bv_offset;
+
+ bio_advance_iter(src, &src_iter, bv.bv_len);
+ }
+ } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK);
+
+ if (k_out)
+ kunmap_atomic(k_out);
+
+ mempool_free(workspace, &c->compression_workspace_pool);
+
+ return ret == Z_STREAM_END ? 0 : -EIO;
+}
+
+static int bio_checksum_uncompress(struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio.bio;
+ int ret = 0;
+
+ /* reset iterator for checksum */
+ bio->bi_iter.bi_size = rbio->compressed_size << 9;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+
+ if (rbio->csum_type != BCH_CSUM_NONE &&
+ rbio->csum != checksum_bio(bio, rbio->csum_type)) {
+ /*
+ * XXX: bch_bbio_count_io_errors() isn't counting checksum
+ * errors
+ */
+ __bcache_io_error(rbio->c, "checksum error");
+ return -EIO;
+ }
+
+ switch (rbio->compression_type) {
+ case BCH_COMPRESSION_NONE:
+ if (rbio->bounce) {
+ bio_advance(bio, rbio->offset << 9);
+ bio_copy_data_iter(rbio->parent, rbio->parent_iter,
+ bio, bio->bi_iter);
+ }
+ break;
+ case BCH_COMPRESSION_LZO1X:
+ BUG();
+ case BCH_COMPRESSION_GZIP:
+ ret = bio_uncompress_gzip(rbio->c,
+ rbio->parent,
+ rbio->parent_iter,
+ bio, bio->bi_iter,
+ rbio->offset << 9);
+ break;
+ case BCH_COMPRESSION_XZ:
+ BUG();
+ default:
+ BUG();
+ }
+
+ if (ret)
+ __bcache_io_error(rbio->c, "decompression error");
+
+ return ret;
+}
+
+/* Inner part that may run in process context */
+static void __bch_read_endio(struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio.bio;
+ int ret;
+
+ ret = bio_checksum_uncompress(rbio);
+ if (ret)
+ rbio->parent->bi_error = ret;
+ bio_endio(rbio->parent);
+
+ if (!ret && rbio->promote &&
+ !test_bit(CACHE_SET_RO, &rbio->c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &rbio->c->flags)) {
+ struct closure *cl = &rbio->promote->cl;
+
+ closure_init(cl, &rbio->c->cl);
+ closure_call(&rbio->promote->iop.cl, bch_write, rbio->c->wq, cl);
+ closure_return_with_destructor(cl, cache_promote_done);
} else {
- if (bio->bi_error)
- orig->bi_error = bio->bi_error;
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (rbio->bounce)
+ bch_bio_free_pages_pool(rbio->c, bio);
- bio_endio(orig);
bio_put(bio);
}
+}
- if (ca)
- percpu_ref_put(&ca->ref);
+void bch_bio_decompress_work(struct work_struct *work)
+{
+ struct bio_decompress_worker *d =
+ container_of(work, struct bio_decompress_worker, work);
+ struct llist_node *list, *next;
+ struct bch_read_bio *rbio;
+
+ while ((list = llist_del_all(&d->bio_list)))
+ for (list = llist_reverse_order(list);
+ list;
+ list = next) {
+ next = llist_next(list);
+ rbio = container_of(list, struct bch_read_bio, list);
+
+ __bch_read_endio(rbio);
+ }
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio.bio);
+ bool stale = //race_fault() ||
+ ptr_stale(rbio->bio.ca, &rbio->bio.ptr);
+ int error = bio->bi_error;
+
+ bch_bbio_count_io_errors(&rbio->bio, error, "reading from cache");
+ percpu_ref_put(&rbio->bio.ca->ref);
+
+ if (error)
+ goto out;
+
+ if (stale)
+ goto stale;
+
+ if (rbio->compression_type != BCH_COMPRESSION_NONE) {
+ struct bio_decompress_worker *d;
+
+ preempt_disable();
+ d = this_cpu_ptr(rbio->c->bio_decompress_worker);
+ llist_add(&rbio->list, &d->bio_list);
+ queue_work(system_unbound_wq, &d->work);
+ preempt_enable();
+ } else {
+ __bch_read_endio(rbio);
+ }
+
+ return;
+stale:
+ if (rbio->promote)
+ kfree(rbio->promote);
+ rbio->promote = NULL;
+
+ /* Raced with the bucket being reused and invalidated: */
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) {
+ atomic_long_inc(&rbio->c->cache_read_races);
+ bch_read_requeue(rbio->c, bio);
+ return;
+ }
+
+ error = -EINTR;
+out:
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (error)
+ rbio->parent->bi_error = error;
+ bio_endio(rbio->parent);
+ bio_put(bio);
+}
+
+void bch_read_extent(struct cache_set *c, struct bio *orig,
+ struct bkey_s_c k, struct extent_pick_ptr *pick,
+ unsigned skip, unsigned flags)
+{
+ struct bio *bio;
+ struct bch_read_bio *rbio;
+ struct cache_promote_op *promote_op = NULL;
+ bool bounce = false, read_full = false;
+
+ /* only promote if we're not reading from the fastest tier: */
+ if ((flags & BCH_READ_PROMOTE) && CACHE_TIER(&pick->ca->mi)) {
+ promote_op = kmalloc(sizeof(*promote_op), GFP_NOIO);
+
+ if (promote_op)
+ bounce = true;
+ }
+
+ /*
+ * note: if compression_type and crc_type both == none, then
+ * compressed/uncompressed size is zero
+ */
+ if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick->crc.csum_type != BCH_CSUM_NONE &&
+ (bio_sectors(orig) != pick->crc.uncompressed_size ||
+ (flags & BCH_READ_FORCE_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (bounce) {
+ unsigned sectors =
+ !read_full ? bio_sectors(orig)
+ : pick->crc.compressed_size ?: k.k->size;
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read);
+ bch_bio_alloc_pages_pool(c, bio, sectors << 9);
+ } else {
+ bio = bio_clone_fast(orig, GFP_NOIO, &c->bio_read);
+ }
+
+ rbio = container_of(bio, struct bch_read_bio, bio.bio);
+ memset(rbio, 0, offsetof(struct bch_read_bio, bio));
+
+ rbio->csum = pick->crc.csum;
+ rbio->compressed_size = pick->crc.compressed_size;
+ rbio->uncompressed_size = pick->crc.uncompressed_size;
+ rbio->offset = pick->crc.offset;
+ rbio->csum_type = pick->crc.csum_type;
+ rbio->compression_type = pick->crc.compression_type;
+
+ __bio_inc_remaining(orig);
+ rbio->parent = orig;
+ rbio->parent_iter = orig->bi_iter;
+ rbio->c = c;
+ rbio->flags = flags;
+ rbio->bounce = bounce;
+ rbio->promote = promote_op;
+ rbio->bio.ptr = pick->ptr;
+ bio->bi_end_io = bch_read_endio;
+ bch_bbio_prep(&rbio->bio, pick->ca);
+
+ if (read_full)
+ rbio->offset += skip;
+ else
+ bio->bi_iter.bi_sector += skip;
+
+ if (promote_op) {
+ promote_op->orig_bio = bio;
+
+ bch_write_op_init(&promote_op->iop, c,
+ &promote_op->bio,
+ &c->promote_write_point,
+ k, k,
+ BCH_WRITE_CHECK_ENOSPC|
+ BCH_WRITE_ALLOC_NOWAIT);
+
+ if (!read_full) {
+ bch_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) + skip),
+ &promote_op->iop.insert_key);
+ bch_key_resize(&promote_op->iop.insert_key.k,
+ bio_sectors(orig));
+ }
+
+ __bio_clone_fast(&promote_op->bio.bio.bio, bio);
+ }
+
+#ifndef CONFIG_BCACHE_NO_IO
+ generic_make_request(bio);
+#else
+ bio_endio(bio);
+#endif
}
/* XXX: this looks a lot like cache_lookup_fn() */
@@ -970,9 +1666,7 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bio->bi_iter.bi_sector), k) {
struct extent_pick_ptr pick;
- struct bio *n;
- struct bbio *bbio;
- unsigned sectors;
+ unsigned bytes, sectors;
bool done;
BUG_ON(bkey_cmp(bkey_start_pos(k.k),
@@ -981,8 +1675,12 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
BUG_ON(bkey_cmp(k.k->p,
POS(inode, bio->bi_iter.bi_sector)) <= 0);
- sectors = k.k->p.offset - bio->bi_iter.bi_sector;
- done = sectors >= bio_sectors(bio);
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ done = bytes == bio->bi_iter.bi_size;
+
+ swap(bio->bi_iter.bi_size, bytes);
pick = bch_extent_pick_ptr(c, k);
if (IS_ERR(pick.ca)) {
@@ -994,43 +1692,19 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
c->prio_clock[READ].hand;
- n = sectors >= bio_sectors(bio)
- ? bio_clone_fast(bio, GFP_NOIO, &c->bio_split)
- : bio_split(bio, sectors, GFP_NOIO,
- &c->bio_split);
-
- n->bi_private = bio;
- n->bi_end_io = bch_read_endio;
- __bio_inc_remaining(bio);
-
- bbio = to_bbio(n);
- bbio->key.k = *k.k;
- bbio->ptr = pick.ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(POS(inode, n->bi_iter.bi_sector),
- &bbio->key);
- bch_cut_back(POS(inode, bio_end_sector(n)),
- &bbio->key.k);
- bch_bbio_prep(bbio, pick.ca);
-
-#ifndef CONFIG_BCACHE_NO_IO
- cache_promote(c, bbio, k);
-#else
- bio_endio(n);
-#endif
+ bch_read_extent(c, bio, k, &pick,
+ bio->bi_iter.bi_sector -
+ bkey_start_offset(k.k),
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE);
} else {
- unsigned bytes = min_t(unsigned, sectors,
- bio_sectors(bio)) << 9;
-
- swap(bio->bi_iter.bi_size, bytes);
zero_fill_bio(bio);
- swap(bio->bi_iter.bi_size, bytes);
-
- bio_advance(bio, bytes);
}
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
if (done) {
bch_btree_iter_unlock(&iter);
return 0;
@@ -1069,7 +1743,8 @@ static void bch_read_retry(struct bbio *bbio)
* The inode, offset and size come from the bbio's key,
* which was set by bch_read_fn().
*/
- inode = bbio->key.k.p.inode;
+ BUG(); /* currently broken */
+ //inode = bbio->key.k.p.inode;
parent = bio->bi_private;
bch_bbio_reset(bbio);
diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h
index fb024d2e5fa8..53c8b3aa07ea 100644
--- a/drivers/md/bcache/io.h
+++ b/drivers/md/bcache/io.h
@@ -1,17 +1,16 @@
#ifndef _BCACHE_IO_H
#define _BCACHE_IO_H
-struct bbio {
- struct cache *ca;
+#include <linux/zlib.h>
- unsigned int bi_idx; /* current index into bvl_vec */
+#define COMPRESSION_WORKSPACE_SIZE \
+ max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), \
+ zlib_inflate_workspacesize())
- unsigned int bi_bvec_done; /* number of bytes completed in
- current bvec */
- unsigned submit_time_us;
- struct bkey_i key;
+struct bbio {
+ struct cache *ca;
struct bch_extent_ptr ptr;
- /* Only ever have a single pointer (the one we're doing io to/from) */
+ unsigned submit_time_us;
struct bio bio;
};
@@ -95,6 +94,41 @@ void bch_write_op_init(struct bch_write_op *, struct cache_set *,
struct bkey_s_c, struct bkey_s_c, unsigned);
void bch_write(struct closure *);
+struct cache_promote_op;
+
+struct bch_read_bio {
+ struct bio *parent;
+ struct bvec_iter parent_iter;
+
+ struct cache_set *c;
+ unsigned flags;
+
+ /* fields align with bch_extent_crc64 */
+ u64 bounce:3,
+ compressed_size:18,
+ uncompressed_size:18,
+ offset:17,
+ csum_type:4,
+ compression_type:4;
+ u64 csum;
+
+ struct cache_promote_op *promote;
+
+ struct llist_node list;
+ struct bbio bio;
+};
+
+struct extent_pick_ptr;
+
+void bch_read_extent(struct cache_set *, struct bio *, struct bkey_s_c,
+ struct extent_pick_ptr *, unsigned, unsigned);
+
+enum bch_read_flags {
+ BCH_READ_FORCE_BOUNCE = 1 << 0,
+ BCH_READ_RETRY_IF_STALE = 1 << 1,
+ BCH_READ_PROMOTE = 1 << 2,
+};
+
int bch_read(struct cache_set *, struct bio *, u64);
void bch_cache_io_error_work(struct work_struct *);
@@ -104,8 +138,7 @@ void bch_bbio_endio(struct bbio *, int, const char *);
void bch_generic_make_request(struct bio *, struct cache_set *);
void bch_bio_submit_work(struct work_struct *);
-void bch_bbio_prep(struct bbio *, struct cache *);
-void bch_submit_bbio(struct bbio *, struct cache *, const struct bkey_i *,
+void bch_submit_bbio(struct bbio *, struct cache *,
const struct bch_extent_ptr *, bool);
void bch_submit_bbio_replicas(struct bch_write_bio *, struct cache_set *,
const struct bkey_i *, unsigned, bool);
@@ -119,6 +152,8 @@ bool cache_promote(struct cache_set *, struct bbio *, struct bkey_s_c);
void bch_read_race_work(struct work_struct *);
void bch_wake_delayed_writes(unsigned long data);
+void bch_bio_decompress_work(struct work_struct *);
+
extern struct workqueue_struct *bcache_io_wq;
#endif /* _BCACHE_IO_H */
diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h
new file mode 100644
index 000000000000..2a8e7c6a7386
--- /dev/null
+++ b/drivers/md/bcache/io_types.h
@@ -0,0 +1,12 @@
+#ifndef _BCACHE_IO_TYPES_H
+#define _BCACHE_IO_TYPES_H
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bio_decompress_worker {
+ struct work_struct work;
+ struct llist_head bio_list;
+};
+
+#endif /* _BCACHE_IO_TYPES_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 0a7550a0294d..08879ba95be7 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -1170,7 +1170,7 @@ static void journal_next_bucket(struct cache_set *c)
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
struct cache *ca;
- unsigned iter;
+ unsigned iter, replicas;
lockdep_assert_held(&j->lock);
@@ -1198,7 +1198,11 @@ static void journal_next_bucket(struct cache_set *c)
if (!(ca = PTR_CACHE(c, ptr)) ||
CACHE_STATE(&ca->mi) != CACHE_ACTIVE ||
ca->journal.sectors_free <= j->sectors_free)
- bch_extent_drop_ptr(e, ptr);
+ __bch_extent_drop_ptr(e, ptr);
+
+ replicas = 0;
+ extent_for_each_ptr(e, ptr)
+ replicas++;
/*
* Determine location of the next journal write:
@@ -1209,7 +1213,7 @@ static void journal_next_bucket(struct cache_set *c)
unsigned next, remaining, nr_buckets =
bch_nr_journal_buckets(&ca->sb);
- if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb))
+ if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb))
break;
/*
@@ -1247,28 +1251,25 @@ static void journal_next_bucket(struct cache_set *c)
if (!remaining)
continue;
- BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX);
-
ja->sectors_free = ca->mi.bucket_size;
-
ja->cur_idx = next;
- e.v->ptr[bch_extent_ptrs(e)] = (struct bch_extent_ptr) {
- .gen = 0,
- .dev = ca->sb.nr_this_dev,
- .offset = bucket_to_sector(ca,
- journal_bucket(ca, ja->cur_idx)),
- };
-
ja->bucket_seq[ja->cur_idx] = j->seq;
+ extent_ptr_append(bkey_i_to_extent(&j->key),
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ journal_bucket(ca, ja->cur_idx)),
+ .dev = ca->sb.nr_this_dev,
+ });
+ replicas++;
+
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
- bch_set_extent_ptrs(e, bch_extent_ptrs(e) + 1);
}
/* set j->sectors_free to the min of any device */
j->sectors_free = UINT_MAX;
- if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb))
+ if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb))
extent_for_each_online_device(c, e, ptr, ca)
j->sectors_free = min(j->sectors_free,
ca->journal.sectors_free);
diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c
index 66bf35c082c5..4ee369a4b7a9 100644
--- a/drivers/md/bcache/migrate.c
+++ b/drivers/md/bcache/migrate.c
@@ -136,14 +136,6 @@ static enum migrate_option migrate_cleanup_key(struct cache_set *c,
return MIGRATE_IGNORE;
}
- /*
- * Remove all pointers, to avoid too many in a tier.
- * migrate_compact_key above does the same when nr_replicas is 1, and
- * doesn't actually work if nr_replicas > 1, so do something simple
- * instead. Effectively, every migration copy is a fresh 'foreground'
- * write.
- */
- bch_set_extent_ptrs(e, 0);
return MIGRATE_COPY;
}
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 2aed02880a36..87dcac33cb4b 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -427,9 +427,8 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q)
static void read_moving_endio(struct bio *bio)
{
- struct bbio *b = container_of(bio, struct bbio, bio);
- struct moving_io *io = container_of(bio->bi_private,
- struct moving_io, cl);
+ struct closure *cl = bio->bi_private;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_queue *q = io->q;
struct moving_context *ctxt = io->context;
bool stopped;
@@ -439,11 +438,9 @@ static void read_moving_endio(struct bio *bio)
if (bio->bi_error) {
io->op.error = bio->bi_error;
moving_error(io->context, MOVING_FLAG_READ);
- } else if (ptr_stale(b->ca, &bkey_i_to_extent_c(&b->key)->v.ptr[0])) {
- io->op.error = -EINTR;
}
- bch_bbio_endio(b, bio->bi_error, "reading data to move");
+ bio_put(bio);
spin_lock_irqsave(&q->lock, flags);
@@ -488,7 +485,10 @@ static void __bch_data_move(struct closure *cl)
bio_set_op_attrs(&io->bio.bio.bio, REQ_OP_READ, 0);
io->bio.bio.bio.bi_end_io = read_moving_endio;
- bch_submit_bbio(&io->bio.bio, pick.ca, &io->key, &pick.ptr, false);
+ bch_read_extent(io->op.c, &io->bio.bio.bio,
+ bkey_i_to_s_c(&io->key),
+ &pick, 0, 0);
+ bio_endio(&io->bio.bio.bio);
}
/*
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 17a910ef114e..167c2f185f0e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -364,9 +364,9 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
miss->bi_end_io = request_endio;
miss->bi_private = &s->cl;
- to_bbio(miss)->key.k = KEY(s->inode,
- bio_end_sector(miss),
- bio_sectors(miss));
+ //to_bbio(miss)->key.k = KEY(s->inode,
+ // bio_end_sector(miss),
+ // bio_sectors(miss));
to_bbio(miss)->ca = NULL;
closure_get(&s->cl);
@@ -375,7 +375,7 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
bkey_to_s_c(&KEY(replace.key.k.p.inode,
replace.key.k.p.offset,
replace.key.k.size)),
- BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED);
+ BCH_WRITE_CACHED);
return 0;
nopromote:
@@ -388,23 +388,6 @@ nopromote:
return 0;
}
-static void bch_cache_read_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct closure *cl = bio->bi_private;
- struct search *s = container_of(cl, struct search, cl);
-
- if (bio->bi_error)
- s->iop.error = bio->bi_error;
- else if (ptr_stale(b->ca, &b->ptr)) {
- /* Read bucket invalidate race */
- atomic_long_inc(&s->iop.c->cache_read_races);
- s->iop.error = -EINTR;
- }
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
-}
-
static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
struct closure *cl = &s->cl;
@@ -417,9 +400,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
for_each_btree_key_with_holes(&iter, s->iop.c, BTREE_ID_EXTENTS,
POS(s->inode, bio->bi_iter.bi_sector), k) {
struct extent_pick_ptr pick;
- struct bio *n;
- struct bbio *bbio;
- unsigned sectors;
+ unsigned sectors, bytes;
bool done;
retry:
BUG_ON(bkey_cmp(bkey_start_pos(k.k),
@@ -428,8 +409,12 @@ retry:
BUG_ON(bkey_cmp(k.k->p,
POS(s->inode, bio->bi_iter.bi_sector)) <= 0);
- sectors = k.k->p.offset - bio->bi_iter.bi_sector;
- done = sectors >= bio_sectors(bio);
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ done = bytes == bio->bi_iter.bi_size;
+
+ swap(bio->bi_iter.bi_size, bytes);
pick = bch_extent_pick_ptr(s->iop.c, k);
if (IS_ERR(pick.ca)) {
@@ -452,33 +437,17 @@ retry:
if (!bkey_extent_is_cached(k.k))
s->read_dirty_data = true;
- n = bio_next_split(bio, sectors, GFP_NOIO,
- &s->d->bio_split);
-
- bbio = to_bbio(n);
- bbio->key.k = *k.k;
- bbio->ptr = pick.ptr;
- bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(POS(s->inode, n->bi_iter.bi_sector),
- &bbio->key);
- bch_cut_back(POS(s->inode, bio_end_sector(n)),
- &bbio->key.k);
-
- bch_bbio_prep(bbio, pick.ca);
-
- n->bi_end_io = bch_cache_read_endio;
- n->bi_private = &s->cl;
-
- closure_get(&s->cl);
- if (!s->bypass) {
- if (cache_promote(s->iop.c, bbio, k))
- s->cache_miss = 1;
- } else
- submit_bio(n);
+ bch_read_extent(s->iop.c, bio, k, &pick,
+ bio->bi_iter.bi_sector -
+ bkey_start_offset(k.k),
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ (!s->bypass ? BCH_READ_PROMOTE : 0));
}
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
if (done) {
bch_btree_iter_unlock(&iter);
goto out;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 61047a66fb0c..05c62ca25f74 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -142,6 +142,8 @@ static const char *bch_blkdev_open(const char *path, void *holder,
if (IS_ERR(bdev))
return "failed to open device";
+ bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
*ret = bdev;
return NULL;
}
@@ -764,9 +766,12 @@ static void cache_set_free(struct closure *cl)
bch_io_clock_exit(&c->io_clock[WRITE]);
bch_io_clock_exit(&c->io_clock[READ]);
bdi_destroy(&c->bdi);
- bioset_exit(&c->btree_read_bio);
+ free_percpu(c->bio_decompress_worker);
+ mempool_exit(&c->compression_workspace_pool);
+ mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_split);
+ bioset_exit(&c->bio_read);
+ bioset_exit(&c->btree_read_bio);
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
mempool_exit(&c->search);
@@ -893,6 +898,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
struct cache_set *c;
unsigned iter_size;
+ int cpu;
c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
if (!c)
@@ -952,9 +958,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freeable);
INIT_LIST_HEAD(&c->btree_cache_freed);
+ mutex_init(&c->bio_bounce_pages_lock);
INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
spin_lock_init(&c->bio_submit_lock);
-
bio_list_init(&c->read_race_list);
spin_lock_init(&c->read_race_lock);
INIT_WORK(&c->read_race_work, bch_read_race_work);
@@ -992,9 +998,14 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
BTREE_RESERVE_SIZE) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) ||
- bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) ||
bioset_init(&c->btree_read_bio, 1, offsetof(struct bbio, bio)) ||
+ bioset_init(&c->bio_read, 4, offsetof(struct bch_read_bio, bio.bio)) ||
+ bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ CRC32_EXTENT_SIZE_MAX / PAGE_SECTORS, 0) ||
+ mempool_init_page_pool(&c->compression_workspace_pool, 1,
+ get_order(COMPRESSION_WORKSPACE_SIZE)) ||
+ !(c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker)) ||
bdi_setup_and_register(&c->bdi, "bcache") ||
bch_io_clock_init(&c->io_clock[READ]) ||
bch_io_clock_init(&c->io_clock[WRITE]) ||
@@ -1003,9 +1014,18 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c))))
goto err;
+ for_each_possible_cpu(cpu) {
+ struct bio_decompress_worker *d =
+ per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+ INIT_WORK(&d->work, bch_bio_decompress_work);
+ init_llist_head(&d->bio_list);
+ }
+
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
c->bdi.congested_fn = bch_congested_fn;
c->bdi.congested_data = c;
+ c->bdi.capabilities |= BDI_CAP_STABLE_WRITES;
return c;
err:
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index ed05c6f4d412..62108446b82d 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -24,30 +24,23 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
struct cache_member_rcu *mi;
- unsigned replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
- unsigned dev;
- bool ret = false;
+ unsigned replicas = 0;
- /*
- * Should not happen except in a pathological situation (too
- * many pointers on the wrong tier?
- */
- if (bch_extent_ptrs(e) == BKEY_EXTENT_PTRS_MAX)
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_MAX_U64s >
+ BKEY_EXTENT_VAL_U64s_MAX)
return false;
- /*
- * Need at least CACHE_SET_DATA_REPLICAS_WANT ptrs not on tier 0
- */
- if (bch_extent_ptrs(e) < replicas)
- return true;
-
- dev = e.v->ptr[bch_extent_ptrs(e) - replicas].dev;
mi = cache_member_info_get(c);
- ret = dev < mi->nr_in_set && !CACHE_TIER(&mi->m[dev]);
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev < mi->nr_in_set &&
+ CACHE_TIER(&mi->m[ptr->dev]))
+ replicas++;
cache_member_info_put();
- return ret;
+ return replicas < CACHE_SET_DATA_REPLICAS_WANT(&c->sb);
}
return false;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index f03453ee69f1..2ca58a386cdf 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -621,13 +621,11 @@ TRACE_EVENT(bcache_btree_insert_key,
__field(u64, b_bucket )
__field(u64, b_offset )
__field(u64, offset )
- __field(u64, bucket )
__field(u32, b_inode )
__field(u32, inode )
__field(u32, size )
__field(u8, level )
__field(u8, id )
- __field(u8, cached )
__field(u8, op )
__field(u8, insert_done )
),
@@ -638,22 +636,18 @@ TRACE_EVENT(bcache_btree_insert_key,
__entry->id = b->btree_id;
__entry->b_inode = b->key.k.p.inode;
__entry->b_offset = b->key.k.p.offset;
- __entry->bucket = PTR_BUCKET_NR_TRACE(b->c, k, 0);
__entry->inode = k->k.p.inode;
__entry->offset = k->k.p.offset;
__entry->size = k->k.size;
- __entry->cached = bkey_extent_is_cached(&k->k);
__entry->op = op;
__entry->insert_done = insert_done;
),
- TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u%s -> %llu",
+ TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u",
__entry->insert_done, __entry->op,
__entry->b_bucket, __entry->level, __entry->id,
__entry->b_inode, __entry->b_offset,
- __entry->inode, __entry->offset,
- __entry->size, __entry->cached ? " cached" : "",
- __entry->bucket)
+ __entry->inode, __entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(btree_split,
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 8ea6758301a7..a5ab2935c146 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -244,19 +244,139 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
/* Extents */
/*
- * bcache keys index the end of the extent as the offset
- * The end is exclusive, while the start is inclusive
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the start of the data that
+ * is currently live. The size field in struct bkey records the current (live)
+ * size of the extent, and is also used to mean "size of region on disk that we
+ * point to" in this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
*/
+enum bch_extent_entry_type {
+ BCH_EXTENT_ENTRY_crc32 = 0,
+ BCH_EXTENT_ENTRY_ptr = 1,
+ BCH_EXTENT_ENTRY_crc64 = 2,
+};
+
+#define BCH_EXTENT_ENTRY_MAX 3
+
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:1,
+ offset:7,
+ compressed_size:8,
+ uncompressed_size:8,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum_type:4,
+ compression_type:4,
+ uncompressed_size:8,
+ compressed_size:8,
+ offset:7,
+ type:1;
+#endif
+ __u32 csum;
+} __attribute__((packed)) __attribute__((aligned(8)));
+
+#define CRC32_EXTENT_SIZE_MAX (1U << 7)
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ compressed_size:18,
+ uncompressed_size:18,
+ offset:17,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_type:4,
+ compression_type:4,
+ offset:17,
+ uncompressed_size:18,
+ compressed_size:18,
+ type:3;
+#endif
+ __u64 csum;
+} __attribute__((packed)) __attribute__((aligned(8)));
+
+#define CRC64_EXTENT_SIZE_MAX (1U << 17)
+
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 gen:8,
+ __u64 type:2,
+ erasure_coded:1,
+ offset:45, /* 16 petabytes */
dev:8,
- offset:48;
+ gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 offset:48,
+ __u64 gen:8,
dev:8,
- gen:8;
+ offset:45,
+ erasure_coded:1,
+ type:2;
#endif
} __attribute__((packed)) __attribute__((aligned(8)));
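
To make the trimming rules described in the comment above concrete, here is an
illustrative sketch (not code from this patch) of what front-trimming @sectors
does to the two kinds of pointers; shrinking the bkey's own size field is
assumed to happen separately (e.g. via bch_cut_front()):

/* Unchecksummed, uncompressed: the pointer itself can simply move forward. */
static void trim_front_plain_sketch(struct bch_extent_ptr *ptr, unsigned sectors)
{
	ptr->offset += sectors;		/* now points at the first live sector */
}

/*
 * Checksummed and/or compressed: the originally written region must stay
 * addressable as a whole, so only the "offset into the original extent"
 * moves; compressed_size, uncompressed_size and csum are unchanged.
 */
static void trim_front_crc_sketch(struct bch_extent_crc64 *crc, unsigned sectors)
{
	crc->offset += sectors;
}

For example, trimming 16 sectors off the front of a 128-sector checksummed
extent leaves compressed_size/uncompressed_size at 128 and csum untouched, sets
crc.offset to 16, and the bkey's size becomes 112; a reader must still read
(and checksum) the full 128 sectors and return only the live 112.
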
@@ -264,6 +384,13 @@ struct bch_extent_ptr {
#define PTR_LOST_DEV 255 /* XXX: kill */
+union bch_extent_entry {
+ __u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+ struct bch_extent_ptr ptr;
+};
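
A minimal sketch, likewise not part of this patch, of how the first-set-bit
type encoding described above can be decoded and used to size entries when
walking an extent's value; the helper names are invented and the entry sizes
are assumed to be exactly sizeof() of the structs above:

#include <linux/bitops.h>

/*
 * crc32 -> 0b001, ptr -> 0b010, crc64 -> 0b100: the type field guarantees
 * exactly one of the low three bits is the first set bit, and its position
 * is the bch_extent_entry_type value.
 */
static inline enum bch_extent_entry_type
extent_entry_type_sketch(const union bch_extent_entry *e)
{
	return __ffs(e->type);
}

static inline unsigned extent_entry_bytes_sketch(const union bch_extent_entry *e)
{
	switch (extent_entry_type_sketch(e)) {
	case BCH_EXTENT_ENTRY_crc32:
		return sizeof(struct bch_extent_crc32);
	case BCH_EXTENT_ENTRY_crc64:
		return sizeof(struct bch_extent_crc64);
	case BCH_EXTENT_ENTRY_ptr:
		return sizeof(struct bch_extent_ptr);
	default:
		return 0;
	}
}

Walking the value is then just repeatedly advancing by
extent_entry_bytes_sketch() from bch_extent.start until the end of the bkey's
value, with each crc entry applying to the pointers that follow it.
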
+
enum {
BCH_EXTENT = 128,
@@ -277,9 +404,10 @@ enum {
struct bch_extent {
struct bch_val v;
- struct bch_extent_ptr ptr[0];
+
+ union bch_extent_entry start[0];
__u64 _data[0];
-};
+} __attribute__((packed)) __attribute__((aligned(8)));
BKEY_VAL_TYPE(extent, BCH_EXTENT);
/* Inodes */
@@ -552,6 +680,18 @@ enum {
BCH_DIRENT_CSUM_SHA1 = 3,
};
+BITMASK(CACHE_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52);
+
+BITMASK(CACHE_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56);
+enum {
+ BCH_COMPRESSION_NONE = 0,
+ BCH_COMPRESSION_LZO1X = 1,
+ BCH_COMPRESSION_GZIP = 2,
+ BCH_COMPRESSION_XZ = 3,
+};
+
+/* backing device specific stuff: */
+
BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U