 drivers/md/bcache/Kconfig     |    2
 drivers/md/bcache/alloc.c     |   95
 drivers/md/bcache/alloc.h     |    6
 drivers/md/bcache/bcache.h    |    9
 drivers/md/bcache/bset.h      |   26
 drivers/md/bcache/btree.c     |   11
 drivers/md/bcache/btree.h     |    2
 drivers/md/bcache/buckets.c   |   15
 drivers/md/bcache/buckets.h   |   15
 drivers/md/bcache/debug.c     |    2
 drivers/md/bcache/extents.c   |  680
 drivers/md/bcache/extents.h   |  290
 drivers/md/bcache/io.c        | 1067
 drivers/md/bcache/io.h        |   55
 drivers/md/bcache/io_types.h  |   12
 drivers/md/bcache/journal.c   |   31
 drivers/md/bcache/migrate.c   |    8
 drivers/md/bcache/move.c      |   14
 drivers/md/bcache/request.c   |   71
 drivers/md/bcache/super.c     |   30
 drivers/md/bcache/tier.c      |   27
 include/trace/events/bcache.h |   10
 include/uapi/linux/bcache.h   |  156
 23 files changed, 1983 insertions(+), 651 deletions(-)
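
The largest structural change visible in the diffstat is the split of bch_alloc_sectors() in alloc.c into bch_alloc_sectors_start() and bch_alloc_sectors_done(), which lets the new io.c write path compress and checksum data after it knows how much space an open bucket has, but before the allocation is committed. The sketch below shows that calling pattern, modelled on the wrapper the patch adds; the function name and error handling here are illustrative only and assume the in-tree bcache headers, not code from the patch itself.

/*
 * Illustrative sketch of the two-phase allocation pattern introduced
 * in alloc.c: get an open_bucket first, decide how many sectors will
 * really be used (e.g. after compressing the payload, as
 * bch_write_extent() does), then append the pointers and consume the
 * space.
 */
static int example_alloc_and_fill(struct cache_set *c,
				  struct write_point *wp,
				  struct bkey_i *k,
				  struct closure *cl)
{
	struct open_bucket *ob;
	unsigned sectors;

	/* Phase 1: get an open_bucket to allocate from, returned locked */
	ob = bch_alloc_sectors_start(c, wp, true, cl);
	if (IS_ERR_OR_NULL(ob))
		return ob ? PTR_ERR(ob) : -ENOSPC;	/* illustrative error handling */

	/*
	 * The caller can now look at ob->sectors_free before committing
	 * to a size - this is what lets the write path compress first:
	 */
	sectors = min_t(unsigned, k->k.size, ob->sectors_free);
	bch_key_resize(&k->k, sectors);

	/* Phase 2: append ob's pointers to k and mark the space as used */
	bch_alloc_sectors_done(c, wp, k, ob, sectors);

	return 0;
}

The old single-call behaviour is preserved by the bch_alloc_sectors() wrapper in the diff below, which simply performs these two phases back to back.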
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 55e135f6dd61..0f9410c06c45 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -3,6 +3,8 @@ config BCACHE tristate "Block device as cache" select LIBCRC32C select FS_POSIX_ACL + select ZLIB_INFLATE + select ZLIB_DEFLATE ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index d567f4ae6df3..7b51888c5968 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -1271,59 +1271,43 @@ static void verify_not_stale(struct cache_set *c, const struct open_bucket *ob) } /* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - cache set. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. - * @cl - closure to wait for a bucket + * Get us an open_bucket we can allocate from, return with it locked: */ -struct open_bucket *bch_alloc_sectors(struct cache_set *c, - struct write_point *wp, - struct bkey_i *k, - bool check_enospc, - struct closure *cl) +struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, + struct write_point *wp, + bool check_enospc, + struct closure *cl) { - struct bkey_s_extent dst; - struct bch_extent_ptr *ptr; struct open_bucket *ob; - struct cache *ca; - unsigned sectors; ob = lock_and_refill_writepoint(c, wp, check_enospc, cl); if (IS_ERR_OR_NULL(ob)) return ob; BUG_ON(!ob->sectors_free); - verify_not_stale(c, ob); + return ob; +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch_alloc_sectors_done(struct cache_set *c, struct write_point *wp, + struct bkey_i *k, struct open_bucket *ob, + unsigned sectors) +{ + struct bch_extent_ptr *ptr; + struct cache *ca; + unsigned i; + /* * We're keeping any existing pointer k has, and appending new pointers: * __bch_write() will only write to the pointers we add here: */ - dst = bkey_i_to_s_extent(k); - - /* Set up the pointer to the space we're allocating: */ - memcpy(&dst.v->ptr[bch_extent_ptrs(dst)], - ob->ptrs, ob->nr_ptrs * sizeof(u64)); - - bch_set_extent_ptrs(dst, bch_extent_ptrs(dst) + ob->nr_ptrs); - - sectors = min_t(unsigned, dst.k->size, ob->sectors_free); - - bch_key_resize(dst.k, sectors); - - /* update open bucket for next time: */ + for (i = 0; i < ob->nr_ptrs; i++) + extent_ptr_append(bkey_i_to_extent(k), ob->ptrs[i]); ob->sectors_free -= sectors; if (ob->sectors_free) @@ -1341,6 +1325,41 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, rcu_read_unlock(); mutex_unlock(&ob->lock); +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates k->size and k->offset (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, k->size indicates how many + * sectors were actually allocated. + * + * Return codes: + * - -EAGAIN: closure was added to waitlist + * - -ENOSPC: out of space and no closure provided + * + * @c - cache set. 
+ * @wp - write point to use for allocating sectors. + * @k - key to return the allocated space information. + * @cl - closure to wait for a bucket + */ +struct open_bucket *bch_alloc_sectors(struct cache_set *c, + struct write_point *wp, + struct bkey_i *k, + bool check_enospc, + struct closure *cl) +{ + struct open_bucket *ob; + + ob = bch_alloc_sectors_start(c, wp, check_enospc, cl); + if (IS_ERR_OR_NULL(ob)) + return ob; + + if (k->k.size > ob->sectors_free) + bch_key_resize(&k->k, ob->sectors_free); + + bch_alloc_sectors_done(c, wp, k, ob, k->k.size); return ob; } diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index 0ab405a19da9..c0118db8440e 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -16,6 +16,12 @@ void bch_prio_timer_start(struct cache_set *, int); void bch_open_bucket_put(struct cache_set *, struct open_bucket *); +struct open_bucket *bch_alloc_sectors_start(struct cache_set *, + struct write_point *, + bool, struct closure *); +void bch_alloc_sectors_done(struct cache_set *, struct write_point *, + struct bkey_i *, struct open_bucket *, unsigned); + struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *, struct bkey_i *, bool, struct closure *); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 807278e80500..a160f5946c6e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -213,6 +213,7 @@ #include "blockdev_types.h" #include "buckets_types.h" #include "clock_types.h" +#include "io_types.h" #include "journal_types.h" #include "keylist_types.h" #include "keybuf_types.h" @@ -418,8 +419,6 @@ struct cache_set { struct closure sb_write; struct semaphore sb_write_mutex; - struct bio_set bio_split; - struct backing_dev_info bdi; /* BTREE CACHE */ @@ -563,7 +562,13 @@ struct cache_set { struct rw_semaphore gc_lock; /* IO PATH */ + struct bio_set bio_read; struct bio_set bio_write; + struct mutex bio_bounce_pages_lock; + mempool_t bio_bounce_pages; + mempool_t compression_workspace_pool; + struct bio_decompress_worker __percpu + *bio_decompress_worker; /* For punting bio submissions to workqueue, io.c */ struct bio_list bio_submit_list; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1cb60b65322a..4d8fb84d8c55 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -324,10 +324,30 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, /* Bkey utility code */ -#define BKEY_EXTENT_PTRS_MAX 4 -#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_PTRS_MAX) +/* Amount of space we might need, in order to add a single pointer */ -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_PTRS_MAX) +/* XXX: move constants to uapi/linux/bcache.h */ + +#define BKEY_EXTENT_PTR_MAX_U64s \ + ((sizeof(struct bch_extent_crc64) + \ + sizeof(struct bch_extent_ptr)) / sizeof(u64)) + +#define BKEY_EXTENT_PTRS_MAX 4 + +#if 0 +#define BKEY_EXTENT_VAL_U64s_MAX \ + ((sizeof(struct bch_extent_crc64) + + sizeof(struct bch_extent_ptr)) * BKEY_EXTENT_PTRS_MAX) +#else +#define BKEY_EXTENT_VAL_U64s_MAX 8 +#endif + +#define BKEY_EXTENT_MAX_U64s (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +#define BKEY_BTREE_PTR_VAL_U64s_MAX BKEY_EXTENT_PTRS_MAX +#define BKEY_BTREE_PTR_U64s_MAX (BKEY_U64s + BKEY_EXTENT_PTRS_MAX) + +#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) #define __bkey_idx(_set, _offset) \ ((_set)->_data + (_offset)) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index eac7354c572f..e422c4b2b0e2 
100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -335,6 +335,12 @@ static void bch_btree_init_next(struct cache_set *c, struct btree *b, /* Btree IO */ +/* + * We seed the checksum with the entire first pointer (dev, gen and offset), + * since for btree nodes we have to store the checksum with the data instead of + * the pointer - this helps guard against reading a valid btree node that is not + * the node we actually wanted: + */ #define btree_csum_set(_b, _i) \ ({ \ void *_data = (void *) (_i) + 8; \ @@ -573,7 +579,7 @@ static void bch_btree_node_read(struct cache_set *c, struct btree *b) bch_bio_map(bio, b->data); bio_get(bio); - bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true); + bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true); closure_sync(&cl); @@ -2351,8 +2357,7 @@ struct btree_split_state { * pointers never have crc/compression info, so we only need to acount * for the pointers for three keys */ - u64 inline_keys[(BKEY_U64s + - BKEY_EXTENT_PTRS_MAX) * 3]; + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; struct btree_reserve *reserve; }; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index ec467b1b56d6..835d4bcf166b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -101,7 +101,7 @@ struct btree { struct rhash_head hash; /* Key/pointer for this btree node */ - BKEY_PADDED(key); + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); /* Single bit - set when accessed, cleared by shrinker */ unsigned long accessed; diff --git a/drivers/md/bcache/buckets.c b/drivers/md/bcache/buckets.c index 3d92c5db96d3..3c96ce502c10 100644 --- a/drivers/md/bcache/buckets.c +++ b/drivers/md/bcache/buckets.c @@ -297,7 +297,7 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b, struct bkey_s_c_extent e, int sectors, bool fail_if_stale, bool metadata) { - const struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr, *ptr2; struct cache *ca; BUG_ON(metadata && bkey_extent_is_cached(e.k)); @@ -350,11 +350,14 @@ int bch_mark_pointers(struct cache_set *c, struct btree *b, return 0; stale: - while (--ptr >= e.v->ptr) - if ((ca = PTR_CACHE(c, ptr))) - bch_mark_bucket(c, ca, b, ptr, -sectors, - bch_extent_ptr_is_dirty(c, e, ptr), - metadata); + extent_for_each_online_device(c, e, ptr2, ca) { + if (ptr2 == ptr) + break; + + bch_mark_bucket(c, ca, b, ptr, -sectors, + bch_extent_ptr_is_dirty(c, e, ptr), + metadata); + } rcu_read_unlock(); return -1; diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h index a2b8e479f80c..5e191ef91812 100644 --- a/drivers/md/bcache/buckets.h +++ b/drivers/md/bcache/buckets.h @@ -48,19 +48,20 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c, const struct bkey_i *k, unsigned ptr) { - const struct cache *ca; size_t bucket = 0; - +#if 0 if (bkey_extent_is_data(&k->k)) { - const struct bkey_i_extent *e = bkey_i_to_extent_c(k); - const struct bch_extent_ptr *p = &e->v.ptr[ptr]; + const struct bch_extent_ptr *ptr; + const struct cache *ca; rcu_read_lock(); - if ((ca = PTR_CACHE(c, p))) - bucket = PTR_BUCKET_NR(ca, p); + extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) { + bucket = PTR_BUCKET_NR(ca, ptr); + break; + } rcu_read_unlock(); } - +#endif return bucket; } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index bb4d715c9b15..99d4657c4f4a 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -74,7 +74,7 @@ void bch_btree_verify(struct cache_set *c, struct btree *b) bio->bi_end_io = 
btree_verify_endio; bch_bio_map(bio, n_sorted); - bch_submit_bbio(to_bbio(bio), pick.ca, &b->key, &pick.ptr, true); + bch_submit_bbio(to_bbio(bio), pick.ca, &pick.ptr, true); closure_sync(&cl); bio_put(bio); diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 140b7a9fed3f..4d5889d6d107 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -162,42 +162,104 @@ bool bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev) return false; } -static bool should_drop_ptr(const struct cache_set *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) +/* returns true if equal */ +static bool crc_cmp(union bch_extent_entry *l, union bch_extent_entry *r) { - struct cache *ca; - struct cache_member *mi; + return extent_entry_type(l) == extent_entry_type(r) && + !memcmp(l, r, extent_entry_bytes(l)); +} - if (ptr->dev == PTR_LOST_DEV) - return false; +/* Increment pointers after @crc by crc's offset until the next crc entry: */ +void extent_adjust_pointers(struct bkey_s_extent e, union bch_extent_entry *crc) +{ + union bch_extent_entry *entry; + unsigned offset = crc_to_64((void *) crc).offset; - if (ptr->dev >= c->sb.nr_in_set) - return true; + extent_for_each_entry_from(e, entry, extent_entry_next(crc)) { + if (!extent_entry_is_ptr(entry)) + return; - mi = rcu_dereference(c->members)->m; + entry->ptr.offset += offset; + } +} - if (bch_is_zero(mi[ptr->dev].uuid.b, sizeof(uuid_le))) - return true; +static void extent_cleanup_crcs(struct bkey_s_extent e) +{ + union bch_extent_entry *crc = e.v->start, *prev = NULL; - if (bch_extent_ptr_is_dirty(c, e, ptr)) - return false; + while (crc != extent_entry_last(e)) { + union bch_extent_entry *next = extent_entry_next(crc); + size_t crc_u64s = extent_entry_u64s(crc); - return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr); + if (!extent_entry_is_crc(crc)) + goto next; + + if (next != extent_entry_last(e) && + extent_entry_is_crc(next)) { + /* + * Two crc entries right after the other, the first one + * doesn't have any pointers and we can just drop it: + */ + goto drop; + } + + if (prev && crc_cmp(crc, prev)) { + /* + * This crc entry is identical to the previous one, drop + * it: + */ + goto drop; + } + + if (!prev && + !crc_to_64((void *) crc).csum_type && + !crc_to_64((void *) crc).compression_type){ + extent_adjust_pointers(e, crc); + goto drop; + } + + prev = crc; +next: + crc = next; + continue; +drop: + memmove(crc, next, + (void *) extent_entry_last(e) - (void *) next); + e.k->u64s -= crc_u64s; + } } -void bch_extent_drop_stale(struct cache_set *c, struct bkey_s k) +void bch_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { - struct bkey_s_extent e = bkey_s_to_extent(k); - struct bch_extent_ptr *ptr; + __bch_extent_drop_ptr(e, ptr); + extent_cleanup_crcs(e); +} - rcu_read_lock(); +static bool should_drop_ptr(const struct cache_set *c, + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr) +{ + struct cache *ca; + + return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr); +} - extent_for_each_ptr_backwards(e, ptr) - if (should_drop_ptr(c, e.c, ptr)) - bch_extent_drop_ptr(e, ptr); +void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e) +{ + struct bch_extent_ptr *ptr = &e.v->start->ptr; + bool dropped = false; + rcu_read_lock(); + while ((ptr = extent_ptr_next(e, ptr))) + if (should_drop_ptr(c, e.c, ptr)) { + __bch_extent_drop_ptr(e, ptr); + dropped = true; + } else + ptr++; rcu_read_unlock(); + + if (dropped) + extent_cleanup_crcs(e); } static bool 
bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k) @@ -207,145 +269,123 @@ static bool bch_ptr_normalize(struct btree_keys *bk, struct bkey_s k) return bch_extent_normalize(b->c, k); } -/* - * Common among btree pointers and normal data extents - */ -static bool __ptr_invalid(const struct cache_set *c, struct bkey_s_c k) +static const char *extent_ptr_invalid(const struct cache_member_rcu *mi, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct cache_member *mi; - bool ret = true; + const struct cache_member *m = mi->m + ptr->dev; - if (k.k->u64s < BKEY_U64s) - return true; + if (ptr->dev == PTR_LOST_DEV) /* XXX: kill */ + return NULL; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX) - return true; + if (ptr->dev > mi->nr_in_set || + bch_is_zero(m->uuid.b, sizeof(uuid_le))) + return "pointer to invalid device"; - mi = cache_member_info_get(c)->m; + if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets) + return "offset past end of device"; - extent_for_each_ptr(e, ptr) { - struct cache_member *m = mi + ptr->dev; - - if (ptr->dev > c->sb.nr_in_set) { - if (ptr->dev != PTR_LOST_DEV) - goto invalid; - - continue; - } + if (ptr->offset < m->bucket_size * m->first_bucket) + return "offset before first bucket"; - if ((ptr->offset + e.k->size > - m->bucket_size * m->nbuckets) || - (ptr->offset < - m->bucket_size * m->first_bucket) || - ((ptr->offset & (m->bucket_size - 1)) + e.k->size > - m->bucket_size)) - goto invalid; - } - - ret = false; -invalid: - cache_member_info_put(); - break; - default: - return true; - } + if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size) + return "spans multiple buckets"; - return ret; + return NULL; } -/* - * Should match __extent_invalid() - returns the reason an extent is invalid - */ -static const char *bch_ptr_status(const struct cache_set *c, - struct cache_member *mi, - struct bkey_s_c_extent e) +static size_t extent_print_ptrs(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c_extent e) { + char *out = buf, *end = buf + size; + const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; + struct bch_extent_crc64 crc; + struct cache *ca; + bool first = true; - if (!bch_extent_ptrs(e)) - return "invalid: no pointers"; - - if (bch_extent_ptrs(e) > BKEY_EXTENT_PTRS_MAX) - return "invalid: too many pointers"; - - extent_for_each_ptr(e, ptr) { - struct cache_member *m = mi + ptr->dev; - struct cache *ca; - - if (ptr->dev > c->sb.nr_in_set) { - if (ptr->dev != PTR_LOST_DEV) - return "pointer to invalid device"; +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - continue; + rcu_read_lock(); + extent_for_each_entry(e, entry) { + if (!first) + p(" "); + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + crc = crc_to_64((void *) entry); + p("crc: c_size %u size %u offset %u csum %u compress %u", + crc.compressed_size, crc.uncompressed_size, + crc.offset, crc.csum_type, crc.compression_type); + break; + case BCH_EXTENT_ENTRY_ptr: + ptr = &entry->ptr; + p("ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr) + ? 
" stale" : ""); + break; } - if (ptr->offset + e.k->size > m->bucket_size * m->nbuckets) - return "invalid: offset past end of device"; - - if (ptr->offset < m->bucket_size * m->first_bucket) - return "invalid: offset before first bucket"; - - if ((ptr->offset & (m->bucket_size - 1)) + - e.k->size > m->bucket_size) - return "invalid: spans multiple buckets"; - - if ((ca = PTR_CACHE(c, ptr)) && - ptr_stale(ca, ptr)) - return "stale"; + first = false; } + rcu_read_unlock(); - if (!e.k->size) - return "zeroed key"; - return ""; + if (bkey_extent_is_cached(e.k)) + p(" cached"); +#undef p + return out - buf; } -static void bch_extent_to_text(struct cache_set *c, char *buf, - size_t size, struct bkey_s_c k) +/* Btree ptrs */ + +static const char *bch_btree_ptr_invalid_reason(const struct cache_set *c, + struct bkey_s_c k) { - struct bkey_s_c_extent e; - char *out = buf, *end = buf + size; - const struct bch_extent_ptr *ptr; + if (bkey_extent_is_cached(k.k)) + return "cached"; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + if (k.k->size) + return "nonzero key size"; - if (bkey_extent_is_data(k.k)) { - e = bkey_s_c_to_extent(k); + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - extent_for_each_ptr(e, ptr) { - if (ptr != e.v->ptr) - p(", "); + switch (k.k->type) { + case BCH_EXTENT: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + struct cache_member_rcu *mi = cache_member_info_get(c); + const char *reason; - p("%u:%llu gen %u", ptr->dev, - (u64) ptr->offset, ptr->gen); - } + extent_for_each_ptr_crc(e, ptr, crc) { + reason = extent_ptr_invalid(mi, ptr, + CACHE_BTREE_NODE_SIZE(&c->sb)); - if (bkey_extent_is_cached(e.k)) - p(" cached"); -#if 0 - if (KEY_CSUM(k)) - p(" cs%llu %llx", KEY_CSUM(k), k->val[1]); -#endif + if (reason) { + cache_member_info_put(); + return reason; + } + } - p(" %s", bch_ptr_status(c, cache_member_info_get(c)->m, e)); cache_member_info_put(); + + if (crc) + return "has crc field"; + + return NULL; } -#undef p -} -/* Btree ptrs */ + default: + return "invalid value type"; + } +} static bool bch_btree_ptr_invalid(const struct cache_set *c, struct bkey_s_c k) { - return bkey_extent_is_cached(k.k) || - k.k->size || - __ptr_invalid(c, k); + return bch_btree_ptr_invalid_reason(c, k); } static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, @@ -358,41 +398,49 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, char buf[160]; struct bucket *g; struct cache *ca; + unsigned replicas = 0; bool bad; - if (bch_extent_ptrs(e) < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) { - bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - cache_set_bug(c, - "btree key bad (too few replicas, %u < %llu): %s", - bch_extent_ptrs(e), - CACHE_SET_META_REPLICAS_HAVE(&c->sb), - buf); + if (bkey_extent_is_cached(k.k)) { + btree_bug(b, "btree ptr marked as cached"); return; } rcu_read_lock(); extent_for_each_online_device(c, e, ptr, ca) { - g = PTR_BUCKET(ca, ptr); + replicas++; - err = "stale"; - if (ptr_stale(ca, ptr)) - goto err; + if ((ca = PTR_CACHE(c, ptr))) { + g = PTR_BUCKET(ca, ptr); - do { - seq = read_seqcount_begin(&c->gc_cur_lock); - bad = (!__gc_will_visit_node(c, b) && - !g->mark.is_metadata); - } while (read_seqcount_retry(&c->gc_cur_lock, seq)); + err = "stale"; + if (ptr_stale(ca, ptr)) + goto err; - err = "inconsistent"; - if (bad) - goto err; + do { + seq = read_seqcount_begin(&c->gc_cur_lock); + bad = 
(!__gc_will_visit_node(c, b) && + !g->mark.is_metadata); + } while (read_seqcount_retry(&c->gc_cur_lock, seq)); + + err = "inconsistent"; + if (bad) + goto err; + } } rcu_read_unlock(); + if (replicas < CACHE_SET_META_REPLICAS_HAVE(&c->sb)) { + bch_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), k); + cache_set_bug(c, + "btree key bad (too few replicas, %u < %llu): %s", + replicas, CACHE_SET_META_REPLICAS_HAVE(&c->sb), buf); + return; + } + return; err: bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); @@ -404,16 +452,43 @@ err: rcu_read_unlock(); } +static void bch_btree_ptr_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch_btree_ptr_invalid_reason(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + struct extent_pick_ptr bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; struct cache *ca; rcu_read_lock(); - extent_for_each_online_device(c, e, ptr, ca) { + extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + if (crc) { + bch_cache_error(ca, + "btree node pointer with crc at btree %u level %u/%u bucket %zu", + b->btree_id, b->level, btree_node_root(b) + ? btree_node_root(b)->level : -1, + PTR_BUCKET_NR(ca, ptr)); + break; + } + if (ptr_stale(ca, ptr)) { bch_cache_error(ca, "stale btree node pointer at btree %u level %u/%u bucket %zu", @@ -440,7 +515,7 @@ const struct btree_keys_ops bch_btree_interior_node_ops = { const struct bkey_ops bch_bkey_btree_ops = { .key_invalid = bch_btree_ptr_invalid, .key_debugcheck = btree_ptr_debugcheck, - .val_to_text = bch_extent_to_text, + .val_to_text = bch_btree_ptr_to_text, }; /* Extents */ @@ -467,9 +542,24 @@ bool __bch_cut_front(struct bpos where, struct bkey_s k) else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - ptr->offset += e.k->size - len; + union bch_extent_crc *crc, *prev_crc = NULL; + + extent_for_each_ptr_crc(e, ptr, crc) { + switch (bch_extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + ptr->offset += e.k->size - len; + break; + case BCH_EXTENT_CRC32: + if (prev_crc != crc) + crc->crc32.offset += e.k->size - len; + break; + case BCH_EXTENT_CRC64: + if (prev_crc != crc) + crc->crc64.offset += e.k->size - len; + break; + } + prev_crc = crc; + } } k.k->size = len; @@ -765,8 +855,8 @@ static void bch_drop_subtract(struct cache_set *c, struct btree *b, static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r) { struct bkey_s_c_extent le, re; + const struct bch_extent_ptr *lp, *rp; s64 offset; - unsigned i; BUG_ON(!l.k->size || !r.k->size); @@ -803,12 +893,17 @@ static bool bkey_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r) * pointer */ - if (bch_extent_ptrs(le) != bch_extent_ptrs(re)) + if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k)) return false; - for (i = 0; i < bch_extent_ptrs(le); i++) { - const struct bch_extent_ptr *lp = le.v->ptr + i; - const struct bch_extent_ptr *rp = re.v->ptr + i; + extent_for_each_ptr(le, lp) { + const union bch_extent_entry *entry = + bkey_idx(re.v, (u64 *) lp - le.v->_data); + + if (!extent_entry_is_ptr(entry)) + return false; + + rp = &entry->ptr; if 
(lp->offset != rp->offset + offset || lp->dev != rp->dev || @@ -1211,10 +1306,63 @@ out: return inserted; } +static const char *bch_extent_invalid_reason(const struct cache_set *c, + struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; + + if (!k.k->size) + return "zero key size"; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct bch_extent_crc64 crc64; + struct cache_member_rcu *mi = cache_member_info_get(c); + unsigned size_ondisk = e.k->size; + const char *reason; + + extent_for_each_entry(e, entry) + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + crc64 = crc_to_64((void *) entry); + + reason = "checksum uncompressed size < key size"; + if (crc64.uncompressed_size < e.k->size) + goto invalid; + + reason = "checksum offset > uncompressed size"; + if (crc64.offset >= crc64.uncompressed_size) + goto invalid; + + size_ondisk = crc64.compressed_size; + break; + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(mi, &entry->ptr, size_ondisk); + if (reason) + goto invalid; + break; + } + + cache_member_info_put(); + return NULL; +invalid: + cache_member_info_put(); + return reason; + } + + default: + return "invalid value type"; + } +} + static bool bch_extent_invalid(const struct cache_set *c, struct bkey_s_c k) { - return (bkey_extent_is_data(k.k) && !k.k->size) || - __ptr_invalid(c, k); + return bch_extent_invalid_reason(c, k); } static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, @@ -1229,27 +1377,17 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, char buf[160]; bool bad; unsigned ptrs_per_tier[CACHE_TIERS]; - unsigned i, tier, replicas; + unsigned i, tier, replicas = 0; memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); - if (!bkey_extent_is_cached(e.k) && - bch_extent_ptrs(e) < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) { - bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - cache_set_bug(c, - "extent key bad (too few replicas, %u < %llu): %s", - bch_extent_ptrs(e), - CACHE_SET_DATA_REPLICAS_HAVE(&c->sb), - buf); - return; - } - mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) { bool dirty = bch_extent_ptr_is_dirty(c, e, ptr); + replicas++; + /* Could be a special pointer such as PTR_CHECK_DEV */ if (ptr->dev >= mi->nr_in_set) { if (ptr->dev != PTR_LOST_DEV) @@ -1299,55 +1437,165 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, goto bad_ptr; } } + cache_member_info_put(); + + if (!bkey_extent_is_cached(e.k) && + replicas < CACHE_SET_DATA_REPLICAS_HAVE(&c->sb)) { + bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); + cache_set_bug(c, + "extent key bad (too few replicas, %u < %llu): %s", + replicas, CACHE_SET_DATA_REPLICAS_HAVE(&c->sb), buf); + return; + } - replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb); + /* + * XXX: _why_ was this added? 
+ */ for (i = 0; i < CACHE_TIERS; i++) - if (ptrs_per_tier[i] > replicas) { - bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); + if (ptrs_per_tier[i] > CACHE_SET_DATA_REPLICAS_WANT(&c->sb)) { + bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); cache_set_bug(c, "extent key bad (too many tier %u replicas): %s", i, buf); break; } - cache_member_info_put(); return; bad_device: bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - cache_set_bug(c, "extent pointer %u device missing: %s", - (unsigned) (ptr - e.v->ptr), buf); + cache_set_bug(c, "extent pointer to dev %u missing device: %s", + ptr->dev, buf); cache_member_info_put(); return; bad_ptr: bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - cache_set_bug(c, "extent pointer %u bad gc mark: %s:\nbucket %zu prio %i " + cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i " "gen %i last_gc %i mark 0x%08x", - (unsigned) (ptr - e.v->ptr), buf, PTR_BUCKET_NR(ca, ptr), + buf, PTR_BUCKET_NR(ca, ptr), g->read_prio, PTR_BUCKET_GEN(ca, ptr), g->oldest_gen, g->mark.counter); cache_member_info_put(); return; } +static void bch_extent_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch_extent_invalid_reason(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + static unsigned PTR_TIER(struct cache_member_rcu *mi, - const struct bch_extent *e, - unsigned ptr) + const struct bch_extent_ptr *ptr) +{ + return ptr->dev < mi->nr_in_set + ? CACHE_TIER(&mi->m[ptr->dev]) + : UINT_MAX; +} + +static void __extent_sort_ptrs(struct cache_member_rcu *mi, + struct bkey_s_extent src) +{ + struct bch_extent_ptr *src_ptr, *dst_ptr; + union bch_extent_entry *src_crc, *dst_crc; + BKEY_PADDED(k) tmp; + struct bkey_s_extent dst; + size_t u64s, crc_u64s; + u64 *p; + + /* + * Insertion sort: + * + * Note: this sort needs to be stable, because pointer order determines + * pointer dirtyness. + */ + + tmp.k.k = *src.k; + dst = bkey_i_to_s_extent(&tmp.k); + set_bkey_val_u64s(dst.k, 0); + + extent_for_each_ptr_crc(src, src_ptr, src_crc) { + extent_for_each_ptr_crc(dst, dst_ptr, dst_crc) + if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr)) + break; + + /* found insert position: */ + + /* + * we're making sure everything has a crc at this point, if + * dst_ptr points to a pointer it better have a crc: + */ + BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc); + BUG_ON(dst_crc && extent_entry_next(dst_crc) != (void *) dst_ptr); + + p = dst_ptr != &extent_entry_last(dst)->ptr + ? 
(void *) dst_crc + : (void *) dst_ptr; + + if (!src_crc) + src_crc = (void *) &((struct bch_extent_crc32) { + .type = 1 << BCH_EXTENT_ENTRY_crc32, + .compressed_size = src.k->size, + .uncompressed_size = src.k->size, + .offset = 0, + .compression_type = BCH_COMPRESSION_NONE, + .csum_type = BCH_CSUM_NONE, + .csum = 0, + }); + + crc_u64s = extent_entry_u64s((void *) src_crc); + u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64); + + memmove(p + u64s, p, + (void *) extent_entry_last(dst) - (void *) p); + set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s); + + memcpy(p, src_crc, crc_u64s * sizeof(u64)); + memcpy(p + crc_u64s, src_ptr, sizeof(*src_ptr)); + } + + /* Sort done - now drop redundant crc entries: */ + extent_cleanup_crcs(dst); + + memcpy(src.v, dst.v, bkey_val_bytes(dst.k)); + set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k)); +} + +static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e) { - unsigned dev = e->ptr[ptr].dev; + struct cache_member_rcu *mi; + struct bch_extent_ptr *ptr, *prev = NULL; + union bch_extent_crc *crc; + + mi = cache_member_info_get(c); + + extent_for_each_ptr_crc(e, ptr, crc) + if (prev && + PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) { + __extent_sort_ptrs(mi, e); + break; + } - return dev < mi->nr_in_set ? CACHE_TIER(&mi->m[dev]) : UINT_MAX; + cache_member_info_put(); } bool bch_extent_normalize(struct cache_set *c, struct bkey_s k) { struct bkey_s_extent e; struct bch_extent_ptr *ptr; - struct cache_member_rcu *mi; - unsigned i; - bool swapped, have_data = false; + bool have_data = false; switch (k.k->type) { case KEY_TYPE_ERROR: @@ -1364,31 +1612,15 @@ bool bch_extent_normalize(struct cache_set *c, struct bkey_s k) case BCH_EXTENT_CACHED: e = bkey_s_to_extent(k); - - bch_extent_drop_stale(c, k); - - mi = cache_member_info_get(c); - - /* Bubble sort pointers by tier, lowest (fastest) tier first */ - do { - swapped = false; - for (i = 0; i + 1 < bch_extent_ptrs(e); i++) { - if (PTR_TIER(mi, e.v, i) > - PTR_TIER(mi, e.v, i + 1)) { - swap(e.v->ptr[i], e.v->ptr[i + 1]); - swapped = true; - } - } - } while (swapped); - - cache_member_info_put(); + bch_extent_drop_stale(c, e); + extent_sort_ptrs(c, e); extent_for_each_ptr(e, ptr) if (ptr->dev != PTR_LOST_DEV) have_data = true; if (!have_data) { - bch_set_extent_ptrs(e, 0); + set_bkey_val_u64s(e.k, 0); if (bkey_extent_is_cached(e.k)) { k.k->type = KEY_TYPE_DISCARD; if (!k.k->version) @@ -1417,6 +1649,7 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, struct cache *avoid) { struct bkey_s_c_extent e; + const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; struct cache *ca; struct extent_pick_ptr ret = { .ca = NULL }; @@ -1439,9 +1672,10 @@ bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, e = bkey_s_c_to_extent(k); rcu_read_lock(); - extent_for_each_online_device(c, e, ptr, ca) + extent_for_each_online_device_crc(c, e, crc, ptr, ca) if (!ptr_stale(ca, ptr)) { ret = (struct extent_pick_ptr) { + .crc = crc_to_64(crc), .ptr = *ptr, .ca = ca, }; @@ -1469,7 +1703,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk, struct btree *b = container_of(bk, struct btree, keys); struct cache_set *c = b->c; struct bkey_s_extent el, er; - unsigned i; + union bch_extent_entry *en_l, *en_r; if (key_merging_disabled(c)) return BCH_MERGE_NOMERGE; @@ -1498,11 +1732,20 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk, el = bkey_i_to_s_extent(l); er = bkey_i_to_s_extent(r); - for (i = 0; i < bch_extent_ptrs(el); i++) { - struct 
bch_extent_ptr *lp = el.v->ptr + i; - struct bch_extent_ptr *rp = er.v->ptr + i; + extent_for_each_entry(el, en_l) { + struct bch_extent_ptr *lp, *rp; struct cache_member *m; + en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data); + + if ((extent_entry_type(en_l) != + extent_entry_type(en_r)) || + extent_entry_is_crc(en_l)) + return BCH_MERGE_NOMERGE; + + lp = &en_l->ptr; + rp = &en_r->ptr; + if (lp->offset + el.k->size != rp->offset || lp->dev != rp->dev || lp->gen != rp->gen) @@ -1533,14 +1776,7 @@ static enum merge_result bch_extent_merge(struct btree_keys *bk, bch_cut_front(l->k.p, r); return BCH_MERGE_PARTIAL; } -#if 0 - if (KEY_CSUM(l)) { - if (KEY_CSUM(r)) - l->val[bch_extent_ptrs(l)] = merge_chksums(l, r); - else - SET_KEY_CSUM(l, 0); - } -#endif + bch_key_resize(&l->k, l->k.size + r->k.size); return BCH_MERGE_MERGE; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index ad6bcdf185ad..08c039bd0869 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -3,7 +3,10 @@ #include "bkey.h" +#include <linux/bcache.h> + struct bch_replace_info; +union bch_extent_crc; struct btree_nr_keys bch_key_sort_fix_overlapping(struct btree_keys *, struct bset *, @@ -31,6 +34,7 @@ struct cache_set; struct journal_res; struct extent_pick_ptr { + struct bch_extent_crc64 crc; struct bch_extent_ptr ptr; struct cache *ca; }; @@ -53,7 +57,7 @@ bool bch_insert_fixup_extent(struct cache_set *, struct btree *, struct bch_replace_info *, struct bpos *, struct journal_res *, unsigned); -void bch_extent_drop_stale(struct cache_set *c, struct bkey_s); +void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent); bool bch_extent_normalize(struct cache_set *, struct bkey_s); static inline bool bkey_extent_is_data(const struct bkey *k) @@ -80,69 +84,269 @@ static inline void bkey_extent_set_cached(struct bkey *k, bool cached) k->type = cached ? 
BCH_EXTENT_CACHED : BCH_EXTENT; } -#define bch_extent_ptrs(_e) bkey_val_u64s((_e).k) +static inline enum bch_extent_entry_type +extent_entry_type(const union bch_extent_entry *e) +{ + int ret = __ffs(e->type); + + EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); -static inline void bch_set_extent_ptrs(struct bkey_s_extent e, unsigned i) + return ret; +} + +static inline size_t __extent_entry_bytes(enum bch_extent_entry_type type) { - BUG_ON(i > BKEY_EXTENT_PTRS_MAX); - set_bkey_val_u64s(e.k, i); + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + return sizeof(struct bch_extent_crc32); + case BCH_EXTENT_ENTRY_crc64: + return sizeof(struct bch_extent_crc64); + case BCH_EXTENT_ENTRY_ptr: + return sizeof(struct bch_extent_ptr); + default: + BUG(); + } } -static inline void bch_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) +static inline size_t __extent_entry_u64s(enum bch_extent_entry_type type) { - BUG_ON(ptr < e.v->ptr || - ptr >= e.v->ptr + bch_extent_ptrs(e.c)); + return __extent_entry_bytes(type) / sizeof(u64); +} - memmove(ptr, ptr + 1, - (void *) (e.v->ptr + bch_extent_ptrs(e.c)) - - (void *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); +static inline size_t extent_entry_bytes(const union bch_extent_entry *e) +{ + return __extent_entry_bytes(extent_entry_type(e)); } -static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) +static inline size_t extent_entry_u64s(const union bch_extent_entry *e) { - /* Dirty pointers come last */ + return extent_entry_bytes(e) / sizeof(u64); +} - if (bkey_extent_is_cached(e.k)) - return false; +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} - return ptr + CACHE_SET_DATA_REPLICAS_WANT(&c->sb) >= - e.v->ptr + bch_extent_ptrs(e); +static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +{ + return !extent_entry_is_ptr(e); } -#define extent_for_each_ptr(_extent, _ptr) \ - for ((_ptr) = (_extent).v->ptr; \ - (_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent); \ - (_ptr)++) +union bch_extent_crc { + u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; +}; -/* - * Use this when you'll be dropping pointers as you iterate. - * Any reason we shouldn't just always do this? 
- */ -#define extent_for_each_ptr_backwards(_extent, _ptr) \ - for ((_ptr) = (_extent).v->ptr + bch_extent_ptrs(_extent) - 1; \ - (_ptr) >= (_extent).v->ptr; \ - --(_ptr)) +enum bch_extent_crc_type { + BCH_EXTENT_CRC_NONE, + BCH_EXTENT_CRC32, + BCH_EXTENT_CRC64, +}; + +static inline enum bch_extent_crc_type +bch_extent_crc_type(const union bch_extent_crc *crc) +{ + if (!crc) + return BCH_EXTENT_CRC_NONE; + + switch (extent_entry_type((void *) crc)) { + case BCH_EXTENT_ENTRY_crc32: + return BCH_EXTENT_CRC32; + case BCH_EXTENT_ENTRY_crc64: + return BCH_EXTENT_CRC64; + default: + BUG(); + } +} + +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + +#define extent_entry_last(_e) \ + bkey_idx((_e).v, bkey_val_u64s((_e).k)) + +#define extent_for_each_entry_from(_e, _entry, _start) \ + for ((_entry) = _start; \ + (_entry) < extent_entry_last(_e); \ + (_entry) = extent_entry_next(_entry)) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +/* Iterates through entries until it hits a pointer: */ +#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \ +({ \ + __label__ out; \ + const union bch_extent_entry *_entry; \ + \ + extent_for_each_entry_from(_e, _entry, (void *) _ptr) \ + if (extent_entry_is_crc(_entry)) { \ + (_crc) = (void *) _entry; \ + } else { \ + _ptr = (typeof(_ptr)) &_entry->ptr; \ + if (_filter) \ + goto out; \ + } \ + \ + _ptr = NULL; \ +out: \ + _ptr; \ +}) -#define __extent_ptr_next_online_device(_c, _extent, _ptr, _ca) \ +#define extent_ptr_next_filter(_e, _ptr, _filter) \ ({ \ - (_ca) = NULL; \ + union bch_extent_crc *_crc; \ \ - while ((_ptr) < (_extent).v->ptr + bch_extent_ptrs(_extent) && \ - !((_ca) = PTR_CACHE(_c, _ptr))) \ - (_ptr)++; \ - (_ca); \ + extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \ }) -#define extent_for_each_online_device(_c, _extent, _ptr, _ca) \ - for ((_ptr) = (_extent).v->ptr; \ - ((_ca) = __extent_ptr_next_online_device(_c, _extent, \ - _ptr, _ca)); \ +#define extent_ptr_crc_next(_e, _crc, _ptr) \ + extent_ptr_crc_next_filter(_e, _crc, _ptr, true) + +#define extent_ptr_next(_e, _ptr) \ + extent_ptr_next_filter(_e, _ptr, true) + +#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \ + for ((_crc) = NULL, \ + (_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\ (_ptr)++) +#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \ + for ((_ptr) = (_start); \ + ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ + (_ptr)++) + +#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ + extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, _filter) + +#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ + extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) + +#define extent_for_each_ptr_from(_e, _ptr, _start) \ + extent_for_each_ptr_from_filter(_e, _ptr, _start, true) + +#define extent_for_each_ptr(_e, _ptr) \ + extent_for_each_ptr_filter(_e, _ptr, true) + +#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \ + extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \ + ((_ca) = PTR_CACHE(_c, _ptr))) + +#define extent_for_each_online_device(_c, _e, _ptr, _ca) \ + extent_for_each_ptr_filter(_e, _ptr, \ + ((_ca) = PTR_CACHE(_c, _ptr))) + +#define extent_ptr_prev(_e, _ptr) \ +({ \ + typeof(&(_e).v->start->ptr) _p; \ + typeof(&(_e).v->start->ptr) _prev = NULL; \ + \ + extent_for_each_ptr(_e, _p) { \ + if (_p == (_ptr)) \ + break; \ + _prev = _p; \ + } \ + 
\ + _prev; \ +}) + +/* + * Use this when you'll be dropping pointers as you iterate. Quadratic, + * unfortunately: + */ +#define extent_for_each_ptr_backwards(_e, _ptr) \ + for ((_ptr) = extent_ptr_prev(_e, NULL); \ + (_ptr); \ + (_ptr) = extent_ptr_prev(_e, _ptr)) + +/* + * make sure the type field gets set correctly: + */ +#define __extent_entry_append(_e, _type, _val) \ +do { \ + union bch_extent_entry *_new = \ + extent_entry_last(extent_i_to_s((_e))); \ + \ + (_e)->k.u64s += __extent_entry_u64s(BCH_EXTENT_ENTRY_##_type); \ + BUG_ON(bkey_val_u64s(&(_e)->k) > BKEY_EXTENT_VAL_U64s_MAX); \ + \ + _new->_type = _val; \ + _new->_type.type = 1 << BCH_EXTENT_ENTRY_##_type; \ + \ + BUG_ON(extent_entry_type(_new) != BCH_EXTENT_ENTRY_##_type); \ +} while (0) + +static inline void extent_crc32_append(struct bkey_i_extent *e, + struct bch_extent_crc32 crc) +{ + __extent_entry_append(e, crc32, crc); +} + +static inline void extent_crc64_append(struct bkey_i_extent *e, + struct bch_extent_crc64 crc) +{ + __extent_entry_append(e, crc64, crc); +} + +static inline void extent_ptr_append(struct bkey_i_extent *e, + struct bch_extent_ptr ptr) +{ + __extent_entry_append(e, ptr, ptr); +} + +/* XXX: inefficient */ +static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c, + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr) +{ + const struct bch_extent_ptr *i; + unsigned seen = 0; + + if (bkey_extent_is_cached(e.k)) + return false; + + /* Dirty pointers come last */ + extent_for_each_ptr_from(e, i, ptr) + seen++; + + return seen <= CACHE_SET_DATA_REPLICAS_WANT(&c->sb); +} + +static inline struct bch_extent_crc64 crc_to_64(const union bch_extent_crc *crc) +{ + switch (bch_extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_extent_crc64) { 0 }; + case BCH_EXTENT_CRC32: + return (struct bch_extent_crc64) { + .compressed_size = crc->crc32.compressed_size, + .uncompressed_size = crc->crc32.uncompressed_size, + .offset = crc->crc32.offset, + .csum_type = crc->crc32.csum_type, + .compression_type = crc->crc32.compression_type, + .csum = crc->crc32.csum, + }; + case BCH_EXTENT_CRC64: + return crc->crc64; + default: + BUG(); + } +} + +void extent_adjust_pointers(struct bkey_s_extent, union bch_extent_entry *); + +/* Doesn't cleanup redundant crcs */ +static inline void __bch_extent_drop_ptr(struct bkey_s_extent e, + struct bch_extent_ptr *ptr) +{ + memmove(ptr, ptr + 1, (void *) extent_entry_last(e) - (void *) (ptr + 1)); + e.k->u64s -= sizeof(*ptr) / sizeof(u64); +} + +void bch_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); bool bch_extent_has_device(struct bkey_s_c_extent, unsigned); bool bch_cut_front(struct bpos, struct bkey_i *); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index d543344be4ab..c0d17ad94623 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -21,6 +21,7 @@ #include "super.h" #include <linux/blkdev.h> +#include <linux/zlib.h> #include <trace/events/bcache.h> @@ -61,29 +62,68 @@ void bch_bio_submit_work(struct work_struct *work) } } -/* Bios with headers */ +/* Allocate, free from mempool: */ -void bch_bbio_prep(struct bbio *b, struct cache *ca) +void bch_bio_free_pages_pool(struct cache_set *c, struct bio *bio) { - struct bvec_iter *iter = &b->bio.bi_iter; + struct bio_vec *bv; + unsigned i; + + bio_for_each_segment_all(bv, bio, i) + mempool_free(bv->bv_page, &c->bio_bounce_pages); +} + +static void bch_bio_alloc_page_pool(struct cache_set *c, struct bio *bio, + bool *using_mempool) +{ + struct bio_vec *bv 
= &bio->bi_io_vec[bio->bi_vcnt++]; + + if (likely(!*using_mempool)) { + bv->bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bv->bv_page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; +} + +static void bch_bio_alloc_pages_pool(struct cache_set *c, struct bio *bio, + size_t bytes) +{ + bool using_mempool = false; + + bio->bi_iter.bi_size = bytes; + + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) + bch_bio_alloc_page_pool(c, bio, &using_mempool); + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Bios with headers */ +static void bch_bbio_prep(struct bbio *b, struct cache *ca) +{ b->ca = ca; b->bio.bi_iter.bi_sector = b->ptr.offset; b->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL; - - b->bi_idx = iter->bi_idx; - b->bi_bvec_done = iter->bi_bvec_done; } -/* XXX: should be bkey, not bkey_i */ -void bch_submit_bbio(struct bbio *b, struct cache *ca, const struct bkey_i *k, +void bch_submit_bbio(struct bbio *b, struct cache *ca, const struct bch_extent_ptr *ptr, bool punt) { struct bio *bio = &b->bio; - b->key = *k; b->ptr = *ptr; - bch_set_extent_ptrs(bkey_i_to_s_extent(&b->key), 1); bch_bbio_prep(b, ca); b->submit_time_us = local_clock_us(); @@ -100,27 +140,28 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c, bool punt) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; struct cache *ca; - unsigned ptr; + unsigned ptr_idx = 0; BUG_ON(bio->orig); - for (ptr = ptrs_from; - ptr < bch_extent_ptrs(e); - ptr++) { + extent_for_each_ptr(e, ptr) { + if (ptr_idx++ < ptrs_from) + continue; + rcu_read_lock(); - ca = PTR_CACHE(c, &e.v->ptr[ptr]); + ca = PTR_CACHE(c, ptr); if (ca) percpu_ref_get(&ca->ref); rcu_read_unlock(); if (!ca) { - bch_submit_bbio(&bio->bio, ca, k, - &e.v->ptr[ptr], punt); + bch_submit_bbio(&bio->bio, ca, ptr, punt); break; } - if (ptr + 1 < bch_extent_ptrs(e)) { + if (ptr + 1 < &extent_entry_last(e)->ptr) { struct bch_write_bio *n = to_wbio(bio_clone_fast(&bio->bio.bio, GFP_NOIO, &ca->replica_set)); @@ -130,16 +171,17 @@ void bch_submit_bbio_replicas(struct bch_write_bio *bio, struct cache_set *c, n->orig = &bio->bio.bio; __bio_inc_remaining(n->orig); - bch_submit_bbio(&n->bio, ca, k, &e.v->ptr[ptr], punt); + bch_submit_bbio(&n->bio, ca, ptr, punt); } else { - bch_submit_bbio(&bio->bio, ca, k, - &e.v->ptr[ptr], punt); + bch_submit_bbio(&bio->bio, ca, ptr, punt); } } } static void bch_bbio_reset(struct bbio *b) { + BUG(); +#if 0 struct bvec_iter *iter = &b->bio.bi_iter; bio_reset(&b->bio); @@ -147,6 +189,7 @@ static void bch_bbio_reset(struct bbio *b) iter->bi_size = b->key.k.size << 9; iter->bi_idx = b->bi_idx; iter->bi_bvec_done = b->bi_bvec_done; +#endif } /* IO errors */ @@ -268,25 +311,198 @@ static inline bool version_stress_test(struct cache_set *c) #endif } -static void __bch_write(struct closure *); - -#if 0 -static void bio_csum(struct bio *bio, struct bkey *k) +static u32 checksum_bio(struct bio *bio, unsigned type) { struct bio_vec bv; struct bvec_iter iter; - u64 crc = 0xffffffffffffffffULL; + u32 csum = U32_MAX; + + if (type == BCH_CSUM_NONE) + return 0; bio_for_each_segment(bv, bio, iter) { - void *d = kmap(bv.bv_page) + bv.bv_offset; + void *p = kmap_atomic(bv.bv_page); - crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len); - kunmap(bv.bv_page); + csum = bch_checksum_update(type, 
csum, + p + bv.bv_offset, + bv.bv_len); + kunmap_atomic(p); } - k->val[bch_extent_ptrs(k)] = crc; + return csum ^= U32_MAX; } -#endif + +static int bio_compress_gzip(struct cache_set *c, struct bio *dst, + struct bio *src, unsigned output_available) +{ + struct bvec_iter src_iter = src->bi_iter; + z_stream strm; + struct page *workspace; + struct page *inp = NULL; + void *k_in = NULL; + bool using_mempool = false; + int ret; + + BUG_ON(dst->bi_iter.bi_size); + + workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO); + strm.workspace = page_address(workspace); + + zlib_deflateInit(&strm, 3); + strm.next_in = NULL; + strm.next_out = NULL; + strm.avail_out = 0; + strm.avail_in = 0; + + while (1) { + if (!strm.avail_out) { + struct bio_vec *bv = &dst->bi_io_vec[dst->bi_vcnt]; + + if (!output_available) { + /* + * XXX: this really shouldn't happen, accounting + * is screwed up somehow: + */ + //pr_err("output_available == 0"); + goto err; + } + + BUG_ON(dst->bi_vcnt >= dst->bi_max_vecs); + + if (k_in) { + kunmap_atomic(k_in); + + bch_bio_alloc_page_pool(c, dst, &using_mempool); + + strm.next_in = kmap_atomic(inp) + + (((unsigned long) strm.next_in) & + (PAGE_SIZE - 1)); + } else { + bch_bio_alloc_page_pool(c, dst, &using_mempool); + } + + strm.next_out = page_address(bv->bv_page); + strm.avail_out = min_t(unsigned, PAGE_SIZE, + output_available); + + dst->bi_iter.bi_size += strm.avail_out; + output_available -= strm.avail_out; + } + + if (!strm.avail_in && src_iter.bi_size && + output_available > PAGE_SIZE * 3 / 2) { + struct bio_vec bv = bio_iter_iovec(src, src_iter); + + if (k_in) + kunmap_atomic(k_in); + + strm.avail_in = bv.bv_len; + inp = bv.bv_page; + k_in = kmap_atomic(inp); + strm.next_in = k_in + bv.bv_offset; + + bio_advance_iter(src, &src_iter, strm.avail_in); + } + + ret = zlib_deflate(&strm, strm.avail_in + ? 
Z_NO_FLUSH : Z_FINISH); + if (ret == Z_STREAM_END) + break; + + BUG_ON(ret != Z_OK); + } + + ret = zlib_deflateEnd(&strm); + BUG_ON(ret != Z_OK); + + BUG_ON(strm.total_out > dst->bi_iter.bi_size); + + /* caller will pad with 0s to block boundary */ + dst->bi_iter.bi_size = strm.total_out; + + /* return number of bytes consumed */ + ret = src->bi_iter.bi_size - src_iter.bi_size; +out: + if (k_in) + kunmap_atomic(k_in); + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); + mempool_free(workspace, &c->compression_workspace_pool); + + return ret; +err: + ret = -1; + goto out; +} + +static unsigned bio_compress(struct cache_set *c, struct bio *dst, + struct bio *src, unsigned *compression_type, + unsigned output_available) +{ + int ret = 0; + + /* if it's only one block, don't bother trying to compress: */ + if (bio_sectors(src) <= c->sb.block_size) + *compression_type = BCH_COMPRESSION_NONE; + + switch (*compression_type) { + case BCH_COMPRESSION_NONE: + /* Just bounce it, for stable checksums: */ +copy: + bch_bio_alloc_pages_pool(c, dst, output_available); + bio_copy_data(dst, src); + return output_available; + case BCH_COMPRESSION_LZO1X: + BUG(); + case BCH_COMPRESSION_GZIP: + ret = bio_compress_gzip(c, dst, src, output_available); + break; + case BCH_COMPRESSION_XZ: + BUG(); + default: + BUG(); + } + + if (ret < 0) { + /* Failed to compress (didn't get smaller): */ + *compression_type = BCH_COMPRESSION_NONE; + goto copy; + } + + BUG_ON(ret & ((1 << (c->block_bits + 9)) - 1)); + + if (DIV_ROUND_UP(dst->bi_iter.bi_size, block_bytes(c)) >= + ret >> (c->block_bits + 9)) { + /* Failed to compress (didn't get smaller): */ + *compression_type = BCH_COMPRESSION_NONE; + goto copy; + } + + /* Pad to blocksize, and zero out padding: */ + while (dst->bi_iter.bi_size & (block_bytes(c) - 1)) { + unsigned idx = dst->bi_iter.bi_size >> PAGE_SHIFT; + unsigned offset = dst->bi_iter.bi_size & (PAGE_SIZE - 1); + unsigned bytes = (PAGE_SIZE - offset) & (block_bytes(c) - 1); + + if (idx < dst->bi_vcnt) { + struct bio_vec *bv = &dst->bi_io_vec[idx]; + + memset(page_address(bv->bv_page) + offset, 0, bytes); + } else { + dst->bi_io_vec[dst->bi_vcnt++] = (struct bio_vec) { + .bv_page = ZERO_PAGE(0), + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + } + + dst->bi_iter.bi_size += bytes; + } + + return ret; +} + +static void __bch_write(struct closure *); static void bch_write_done(struct closure *cl) { @@ -302,6 +518,11 @@ static void bch_write_done(struct closure *cl) if (!op->write_done) continue_at(cl, __bch_write, op->io_wq); + if (op->replace_collision) { + trace_bcache_promote_collision(&op->replace_info.key.k); + atomic_inc(&op->c->accounting.collector.cache_miss_collisions); + } + percpu_ref_put(&op->c->writes); bch_keylist_free(&op->insert_keys); closure_return(cl); @@ -381,6 +602,7 @@ static void bch_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); if (bio->bi_error) { /* TODO: We could try to recover from this. 
*/ @@ -393,14 +615,277 @@ static void bch_write_endio(struct bio *bio) set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache"); + if (wbio->orig) + bio_endio(wbio->orig); + else if (wbio->bounce) + bch_bio_free_pages_pool(op->c, bio); + + bch_bbio_endio(&wbio->bio, bio->bi_error, "writing data to cache"); +} + +static const unsigned bch_crc_size[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64] = 8, +}; + +/* + * We're writing another replica for this extent, so while we've got the data in + * memory we'll be computing a new checksum for the currently live data. + * + * If there are other replicas we aren't moving, and they are checksummed but + * not compressed, we can modify them to point to only the data that is + * currently live (so that readers won't have to bounce) while we've got the + * checksum we need: + * + * XXX: to guard against data being corrupted while in memory, instead of + * recomputing the checksum here, it would be better in the read path to instead + * of computing the checksum of the entire extent: + * + * | extent | + * + * compute the checksums of the live and dead data separately + * | dead data || live data || dead data | + * + * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then + * use crc_live here (that we verified was correct earlier) + */ +static void extent_cleanup_checksums(struct bkey_s_extent e, + u64 csum, unsigned csum_type) +{ + union bch_extent_entry *entry; + + extent_for_each_entry(e, entry) + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + continue; + case BCH_EXTENT_ENTRY_crc32: + if (entry->crc32.compression_type != BCH_COMPRESSION_NONE || + bch_crc_size[csum_type] > sizeof(entry->crc32.csum)) + continue; + + extent_adjust_pointers(e, entry); + entry->crc32.compressed_size = e.k->size; + entry->crc32.uncompressed_size = e.k->size; + entry->crc32.offset = 0; + entry->crc32.csum_type = csum_type; + entry->crc32.csum = csum; + break; + case BCH_EXTENT_ENTRY_crc64: + if (entry->crc64.compression_type != BCH_COMPRESSION_NONE || + bch_crc_size[csum_type] > sizeof(entry->crc64.csum)) + continue; + + extent_adjust_pointers(e, entry); + entry->crc64.compressed_size = e.k->size; + entry->crc64.uncompressed_size = e.k->size; + entry->crc64.offset = 0; + entry->crc64.csum_type = csum_type; + entry->crc64.csum = csum; + break; + } +} + +static void extent_checksum_append(struct bkey_i_extent *e, + unsigned compressed_size, + unsigned uncompressed_size, + unsigned compression_type, + u64 csum, unsigned csum_type) +{ + struct bch_extent_ptr *ptr; + union bch_extent_crc *crc; + + BUG_ON(compressed_size > uncompressed_size); + BUG_ON(uncompressed_size != e->k.size); + + /* + * Look up the last crc entry, so we can check if we need to add + * another: + */ + extent_for_each_ptr_crc(extent_i_to_s(e), ptr, crc) + ; + + switch (bch_extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + if (csum_type == BCH_CSUM_NONE && + compression_type == BCH_COMPRESSION_NONE) + return; + break; + case BCH_EXTENT_CRC32: + if (crc->crc32.compressed_size == compressed_size && + crc->crc32.uncompressed_size == uncompressed_size && + crc->crc32.offset == 0 && + crc->crc32.compression_type == compression_type && + crc->crc32.csum_type == csum_type && + crc->crc32.csum == csum) + return; + break; + case BCH_EXTENT_CRC64: + if (crc->crc64.compressed_size == compressed_size && + crc->crc64.uncompressed_size == uncompressed_size && + crc->crc64.offset == 0 && + 
crc->crc32.compression_type == compression_type && + crc->crc64.csum_type == csum_type && + crc->crc64.csum == csum) + return; + break; + } + + switch (csum_type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + BUG_ON(compressed_size > CRC32_EXTENT_SIZE_MAX || + uncompressed_size > CRC32_EXTENT_SIZE_MAX); + + extent_crc32_append(e, (struct bch_extent_crc32) { + .compressed_size = compressed_size, + .uncompressed_size = uncompressed_size, + .offset = 0, + .compression_type = compression_type, + .csum_type = csum_type, + .csum = csum, + }); + break; + case BCH_CSUM_CRC64: + BUG_ON(compressed_size > CRC64_EXTENT_SIZE_MAX || + uncompressed_size > CRC64_EXTENT_SIZE_MAX); + + extent_crc64_append(e, (struct bch_extent_crc64) { + .compressed_size = compressed_size, + .uncompressed_size = uncompressed_size, + .offset = 0, + .compression_type = compression_type, + .csum_type = csum_type, + .csum = csum, + }); + break; + default: + BUG(); + } +} + +static void bch_write_extent(struct bch_write_op *op, + struct open_bucket *ob, + struct bkey_i *k, struct bio *orig) +{ + struct cache_set *c = op->c; + struct bio *bio; + struct bch_write_bio *wbio; + struct bkey_i_extent *e = bkey_i_to_extent(k); + struct bch_extent_ptr *ptr; + unsigned ptrs_from = 0; + unsigned csum_type = CACHE_DATA_PREFERRED_CSUM_TYPE(&c->sb); + unsigned compression_type = CACHE_COMPRESSION_TYPE(&c->sb); + + /* don't refetch csum type/compression type */ + barrier(); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptrs_from++; + + if (csum_type != BCH_CSUM_NONE || + compression_type != BCH_COMPRESSION_NONE) { + /* all units here in bytes */ + unsigned output_available, input_available, input_consumed; + u64 csum; + + BUG_ON(bio_sectors(orig) != k->k.size); + + /* XXX: decide extent size better: */ + output_available = min(k->k.size, + min(ob->sectors_free, + CRC32_EXTENT_SIZE_MAX)) << 9; + + input_available = min(orig->bi_iter.bi_size, + CRC32_EXTENT_SIZE_MAX << 9); + + /* + * temporarily set input bio's size to the max we want to + * consume from it, in order to avoid overflow in the crc info + */ + swap(orig->bi_iter.bi_size, input_available); + + bio = bio_alloc_bioset(GFP_NOIO, + DIV_ROUND_UP(output_available, PAGE_SIZE), + &c->bio_write); + wbio = to_wbio(bio); + wbio->orig = NULL; + wbio->bounce = true; + + input_consumed = bio_compress(c, bio, orig, + &compression_type, + output_available); + + swap(orig->bi_iter.bi_size, input_available); + + bch_key_resize(&k->k, input_consumed >> 9); + bio_advance(orig, input_consumed); + + /* + * XXX: could move checksumming out from under the open + * bucket lock - but compression is also being done + * under it + */ + csum = checksum_bio(bio, csum_type); + + /* + * If possible, adjust existing pointers to only point to + * currently live data, while we have the checksum for that + * data: + */ + extent_cleanup_checksums(extent_i_to_s(e), csum, csum_type); + + /* + * Add a bch_extent_crc header for the pointers that + * bch_alloc_sectors_done() is going to append: + */ + extent_checksum_append(e, bio_sectors(bio), e->k.size, + compression_type, + csum, csum_type); + + bch_alloc_sectors_done(op->c, op->wp, k, ob, bio_sectors(bio)); + } else { + if (k->k.size > ob->sectors_free) + bch_key_resize(&k->k, ob->sectors_free); + + /* + * We might need a checksum entry, if there's a previous + * checksum entry we need to override: + */ + extent_checksum_append(e, k->k.size, k->k.size, + compression_type, 0, csum_type); + + bch_alloc_sectors_done(op->c, op->wp, k, ob, k->k.size); + + bio = 
bio_next_split(orig, k->k.size, GFP_NOIO, + &op->c->bio_write); + if (bio == orig) + bio_get(bio); + + wbio = to_wbio(bio); + wbio->orig = NULL; + wbio->bounce = false; + } + + bio->bi_end_io = bch_write_endio; + bio->bi_private = &op->cl; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + +#ifndef CONFIG_BCACHE_NO_IO + bch_submit_bbio_replicas(wbio, op->c, k, ptrs_from, false); +#else + ptrs_from = ptrs_from; + bch_bbio_prep(&wbio->bio, NULL); + closure_get(bio->bi_private); + bio_endio(bio); +#endif } static void __bch_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->bio->bio.bio, *n; - unsigned open_bucket_nr = 0, ptrs_from; + struct bio *bio = &op->bio->bio.bio; + unsigned open_bucket_nr = 0; struct open_bucket *b; memset(op->open_buckets, 0, sizeof(op->open_buckets)); @@ -412,8 +897,9 @@ static void __bch_write(struct closure *cl) continue_at(cl, bch_write_done, op->c->wq); } - bch_extent_drop_stale(op->c, bkey_i_to_s(&op->insert_key)); - ptrs_from = bch_extent_ptrs(bkey_i_to_s_extent(&op->insert_key)); + if (bkey_extent_is_data(&op->insert_key.k)) + bch_extent_drop_stale(op->c, + bkey_i_to_s_extent(&op->insert_key)); /* * Journal writes are marked REQ_PREFLUSH; if the original write was a @@ -438,9 +924,9 @@ static void __bch_write(struct closure *cl) k = op->insert_keys.top; bkey_copy(k, &op->insert_key); - b = bch_alloc_sectors(op->c, op->wp, k, - op->check_enospc, - op->nowait ? NULL : cl); + b = bch_alloc_sectors_start(op->c, op->wp, + op->check_enospc, + op->nowait ? NULL : cl); BUG_ON(!b); if (PTR_ERR(b) == -EAGAIN) { @@ -458,30 +944,15 @@ static void __bch_write(struct closure *cl) op->open_buckets[open_bucket_nr++] = b; + /* + * XXX: if we compressed, we didn't use all the space we just + * allocated + */ + bch_write_extent(op, b, k, bio); bch_cut_front(k->k.p, &op->insert_key); - n = bio_next_split(bio, k->k.size, GFP_NOIO, - &op->c->bio_write); - if (n == bio) - bio_get(bio); - - n->bi_end_io = bch_write_endio; - n->bi_private = cl; -#if 0 - if (KEY_CSUM(k)) - bio_csum(n, k); -#endif - trace_bcache_cache_insert(&k->k); - - bio_set_op_attrs(n, REQ_OP_WRITE, 0); -#ifndef CONFIG_BCACHE_NO_IO - bch_submit_bbio_replicas(to_wbio(n), op->c, k, - ptrs_from, false); -#else - bch_bbio_prep(to_bbio(n), NULL); - closure_get(n->bi_private); - bio_endio(n); -#endif + BUG_ON(op->insert_key.k.size && + op->insert_key.k.size != bio_sectors(bio)); BUG_ON(bch_extent_normalize(op->c, bkey_i_to_s(k))); bch_check_mark_super(op->c, k, false); @@ -489,7 +960,9 @@ static void __bch_write(struct closure *cl) bkey_extent_set_cached(&k->k, op->cached); bch_keylist_enqueue(&op->insert_keys); - } while (n != bio); + + trace_bcache_cache_insert(&k->k); + } while (op->insert_key.k.size); op->write_done = true; continue_at(cl, bch_write_index, op->c->wq); @@ -775,69 +1248,17 @@ struct cache_promote_op { struct closure cl; struct bio *orig_bio; struct bch_write_op iop; - bool stale; /* was the ptr stale after the read? 
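Note that bch_write_extent() above bounds every extent it emits by three limits: how much the key still covers, how much space is left in the open bucket, and, when checksumming or compressing, the largest size a bch_extent_crc32 header can describe. A rough sketch of that bound in sectors; the helper is illustrative, only CRC32_EXTENT_SIZE_MAX comes from the new on-disk format further down:

#define CRC32_EXTENT_SIZE_MAX	(1U << 7)	/* sectors, see the uapi header below */

static unsigned min_u(unsigned a, unsigned b)
{
	return a < b ? a : b;
}

/* sectors a single checksummed/compressed extent may cover: */
static unsigned one_extent_sectors(unsigned key_sectors,
				   unsigned bucket_sectors_free)
{
	return min_u(key_sectors,
		     min_u(bucket_sectors_free, CRC32_EXTENT_SIZE_MAX));
}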
*/ struct bch_write_bio bio; /* must be last */ }; static void cache_promote_done(struct closure *cl) { - struct cache_promote_op *op = container_of(cl, - struct cache_promote_op, cl); - struct cache_set *c = op->iop.c; - - if (op->iop.replace_collision) { - trace_bcache_promote_collision(&op->iop.replace_info.key.k); - atomic_inc(&c->accounting.collector.cache_miss_collisions); - } - - bch_bio_free_pages(&op->iop.bio->bio.bio); - kfree(op); -} - -static void cache_promote_write(struct closure *cl) -{ - struct cache_promote_op *op = container_of(cl, - struct cache_promote_op, cl); - struct bio *bio = &op->iop.bio->bio.bio; - - bio_reset(bio); - bio->bi_iter.bi_sector = bkey_start_offset(&op->iop.insert_key.k); - bio->bi_iter.bi_size = op->iop.insert_key.k.size << 9; - /* needed to reinit bi_vcnt so pages can be freed later */ - bch_bio_map(bio, NULL); - - bio_copy_data(op->orig_bio, bio); - op->orig_bio->bi_error = op->iop.error; - bio_endio(op->orig_bio); - - if (!op->stale && - !op->iop.error && - !test_bit(CACHE_SET_RO, &op->iop.c->flags) && - !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags)) - closure_call(&op->iop.cl, bch_write, NULL, cl); - - closure_return_with_destructor(cl, cache_promote_done); -} - -static void cache_promote_endio(struct bio *bio) -{ - struct bbio *b = to_bbio(bio); struct cache_promote_op *op = - container_of(bio, struct cache_promote_op, bio.bio.bio); + container_of(cl, struct cache_promote_op, cl); - /* - * If the bucket was reused while our bio was in flight, we might have - * read the wrong data. Set s->error but not error so it doesn't get - * counted against the cache device, but we'll still reread the data - * from the backing device. - */ - - if (bio->bi_error) - op->iop.error = bio->bi_error; - else if (b->ca && ptr_stale(b->ca, &b->ptr)) - op->stale = 1; - - bch_bbio_endio(b, bio->bi_error, "reading from cache"); + bch_bio_free_pages_pool(op->iop.c, op->orig_bio); + bio_put(op->orig_bio); + kfree(op); } /** @@ -852,6 +1273,7 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio, struct bkey_s_c new, unsigned write_flags) { +#if 0 struct cache_promote_op *op; struct bio *bio; unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE); @@ -886,10 +1308,12 @@ void __cache_promote(struct cache_set *c, struct bbio *orig_bio, op->stale = 0; bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point, - new, old, BCH_WRITE_CHECK_ENOSPC|write_flags); + new, old, + BCH_WRITE_CHECK_ENOSPC| + BCH_WRITE_ALLOC_NOWAIT|write_flags); - bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key); - bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k); + //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key); + //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k); trace_bcache_promote(&orig_bio->bio); @@ -901,24 +1325,7 @@ out_free: kfree(op); out_submit: generic_make_request(&orig_bio->bio); -} - -/** - * cache_promote - promote data stored in higher tiers - * - * Used for flash only volumes. - * - * @bio must actually be a bbio with valid key. 
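With cache_promote() removed, the decision to promote now lives in bch_read_extent() below: a promote op is only allocated when the caller passed BCH_READ_PROMOTE and the pointer being read lives on a slower tier, and the resulting write uses BCH_WRITE_ALLOC_NOWAIT so it can never stall the read it rides on. A condensed sketch of that gate, reusing the flag values declared in io.h (the standalone helper itself is illustrative):

#include <stdbool.h>

enum bch_read_flags {
	BCH_READ_FORCE_BOUNCE	= 1 << 0,
	BCH_READ_RETRY_IF_STALE	= 1 << 1,
	BCH_READ_PROMOTE	= 1 << 2,
};

/* tier 0 is the fastest tier; anything slower is a promotion candidate */
static bool should_promote(unsigned read_flags, unsigned device_tier)
{
	return (read_flags & BCH_READ_PROMOTE) && device_tier != 0;
}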
- */ -bool cache_promote(struct cache_set *c, struct bbio *bio, struct bkey_s_c k) -{ - if (!CACHE_TIER(&bio->ca->mi)) { - generic_make_request(&bio->bio); - return 0; - } - - __cache_promote(c, bio, k, k, BCH_WRITE_ALLOC_NOWAIT); - return 1; +#endif } /* Read */ @@ -927,36 +1334,325 @@ static void bch_read_requeue(struct cache_set *c, struct bio *bio) { unsigned long flags; + BUG(); + spin_lock_irqsave(&c->read_race_lock, flags); bio_list_add(&c->read_race_list, bio); spin_unlock_irqrestore(&c->read_race_lock, flags); queue_work(c->wq, &c->read_race_work); } -static void bch_read_endio(struct bio *bio) +static int bio_uncompress_gzip(struct cache_set *c, + struct bio *dst, struct bvec_iter dst_iter, + struct bio *src, struct bvec_iter src_iter, + unsigned skip) { - struct bbio *b = to_bbio(bio); - struct cache *ca = b->ca; - struct bio *orig = bio->bi_private; - - bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache"); - - if (!bio->bi_error && ca && - (race_fault() || - ptr_stale(ca, &b->ptr))) { - /* Read bucket invalidate race */ - atomic_long_inc(&ca->set->cache_read_races); - bch_read_requeue(ca->set, bio); + z_stream strm; + struct page *workspace; + void *k_out = NULL; + u8 garbage[128]; + int ret; + bool decompress_all = true; + + workspace = mempool_alloc(&c->compression_workspace_pool, GFP_NOIO); + strm.workspace = page_address(workspace); + + zlib_inflateInit(&strm); + strm.next_in = NULL; + strm.next_out = NULL; + strm.avail_out = 0; + strm.avail_in = 0; + + do { + if (strm.avail_out) { + ; + } else if (skip) { + strm.avail_out = min_t(unsigned, sizeof(garbage), skip); + strm.next_out = garbage; + + skip -= strm.avail_out; + } else if (dst_iter.bi_size) { + struct bio_vec bv = bio_iter_iovec(dst, dst_iter); + + if (k_out) + kunmap_atomic(k_out); + k_out = kmap_atomic(bv.bv_page) + bv.bv_offset; + + strm.avail_out = bv.bv_len; + strm.next_out = k_out; + + bio_advance_iter(dst, &dst_iter, bv.bv_len); + } else { + /* Uncompressed all the data we actually want: */ + if (!decompress_all) { + ret = Z_STREAM_END; + break; + } + + strm.avail_out = sizeof(garbage); + strm.next_out = garbage; + } + + if (!strm.avail_in && src_iter.bi_size) { + struct bio_vec bv = bio_iter_iovec(src, src_iter); + + strm.avail_in = bv.bv_len; + strm.next_in = page_address(bv.bv_page) + bv.bv_offset; + + bio_advance_iter(src, &src_iter, bv.bv_len); + } + } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK); + + if (k_out) + kunmap_atomic(k_out); + + mempool_free(workspace, &c->compression_workspace_pool); + + return ret == Z_STREAM_END ? 
0 : -EIO; +} + +static int bio_checksum_uncompress(struct bch_read_bio *rbio) +{ + struct bio *bio = &rbio->bio.bio; + int ret = 0; + + /* reset iterator for checksum */ + bio->bi_iter.bi_size = rbio->compressed_size << 9; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_bvec_done = 0; + + if (rbio->csum_type != BCH_CSUM_NONE && + rbio->csum != checksum_bio(bio, rbio->csum_type)) { + /* + * XXX: bch_bbio_count_io_errors() isn't counting checksum + * errors + */ + __bcache_io_error(rbio->c, "checksum error"); + return -EIO; + } + + switch (rbio->compression_type) { + case BCH_COMPRESSION_NONE: + if (rbio->bounce) { + bio_advance(bio, rbio->offset << 9); + bio_copy_data_iter(rbio->parent, rbio->parent_iter, + bio, bio->bi_iter); + } + break; + case BCH_COMPRESSION_LZO1X: + BUG(); + case BCH_COMPRESSION_GZIP: + ret = bio_uncompress_gzip(rbio->c, + rbio->parent, + rbio->parent_iter, + bio, bio->bi_iter, + rbio->offset << 9); + break; + case BCH_COMPRESSION_XZ: + BUG(); + default: + BUG(); + } + + if (ret) + __bcache_io_error(rbio->c, "decompression error"); + + return ret; +} + +/* Inner part that may run in process context */ +static void __bch_read_endio(struct bch_read_bio *rbio) +{ + struct bio *bio = &rbio->bio.bio; + int ret; + + ret = bio_checksum_uncompress(rbio); + if (ret) + rbio->parent->bi_error = ret; + bio_endio(rbio->parent); + + if (!ret && rbio->promote && + !test_bit(CACHE_SET_RO, &rbio->c->flags) && + !test_bit(CACHE_SET_STOPPING, &rbio->c->flags)) { + struct closure *cl = &rbio->promote->cl; + + closure_init(cl, &rbio->c->cl); + closure_call(&rbio->promote->iop.cl, bch_write, rbio->c->wq, cl); + closure_return_with_destructor(cl, cache_promote_done); } else { - if (bio->bi_error) - orig->bi_error = bio->bi_error; + if (rbio->promote) + kfree(rbio->promote); + if (rbio->bounce) + bch_bio_free_pages_pool(rbio->c, bio); - bio_endio(orig); bio_put(bio); } +} - if (ca) - percpu_ref_put(&ca->ref); +void bch_bio_decompress_work(struct work_struct *work) +{ + struct bio_decompress_worker *d = + container_of(work, struct bio_decompress_worker, work); + struct llist_node *list, *next; + struct bch_read_bio *rbio; + + while ((list = llist_del_all(&d->bio_list))) + for (list = llist_reverse_order(list); + list; + list = next) { + next = llist_next(list); + rbio = container_of(list, struct bch_read_bio, list); + + __bch_read_endio(rbio); + } +} + +static void bch_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio.bio); + bool stale = //race_fault() || + ptr_stale(rbio->bio.ca, &rbio->bio.ptr); + int error = bio->bi_error; + + bch_bbio_count_io_errors(&rbio->bio, error, "reading from cache"); + percpu_ref_put(&rbio->bio.ca->ref); + + if (error) + goto out; + + if (stale) + goto stale; + + if (rbio->compression_type != BCH_COMPRESSION_NONE) { + struct bio_decompress_worker *d; + + preempt_disable(); + d = this_cpu_ptr(rbio->c->bio_decompress_worker); + llist_add(&rbio->list, &d->bio_list); + queue_work(system_unbound_wq, &d->work); + preempt_enable(); + } else { + __bch_read_endio(rbio); + } + + return; +stale: + if (rbio->promote) + kfree(rbio->promote); + rbio->promote = NULL; + + /* Raced with the bucket being reused and invalidated: */ + if (rbio->flags & BCH_READ_RETRY_IF_STALE) { + atomic_long_inc(&rbio->c->cache_read_races); + bch_read_requeue(rbio->c, bio); + return; + } + + error = -EINTR; +out: + if (rbio->promote) + kfree(rbio->promote); + if (error) + rbio->parent->bi_error = error; + bio_endio(rbio->parent); + bio_put(bio); 
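Because bch_read_endio() runs in interrupt context, reads that still need checksum verification or decompression are handed off: the rbio is pushed onto a per-CPU lockless list and a work item drains that list from process context. A compact sketch of the hand-off pattern; the llist and workqueue calls are the real kernel APIs, while the payload struct and function names are placeholders:

#include <linux/kernel.h>
#include <linux/llist.h>
#include <linux/workqueue.h>

struct pending_read {
	struct llist_node	list;
	/* payload: parent bio, checksum, compression type, ... */
};

static void hand_off(struct llist_head *head, struct work_struct *work,
		     struct pending_read *r)
{
	llist_add(&r->list, head);		/* lockless, usable from irq context */
	queue_work(system_unbound_wq, work);	/* finish in process context */
}

static void drain(struct llist_head *head)
{
	struct llist_node *l, *next;

	/* llist_add() builds a LIFO chain; reverse it to keep submission order */
	for (l = llist_reverse_order(llist_del_all(head)); l; l = next) {
		struct pending_read *r = container_of(l, struct pending_read, list);

		next = l->next;	/* read before the completion may free r */
		(void) r;	/* placeholder: verify checksum, decompress, end parent bio */
	}
}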
+} + +void bch_read_extent(struct cache_set *c, struct bio *orig, + struct bkey_s_c k, struct extent_pick_ptr *pick, + unsigned skip, unsigned flags) +{ + struct bio *bio; + struct bch_read_bio *rbio; + struct cache_promote_op *promote_op = NULL; + bool bounce = false, read_full = false; + + /* only promote if we're not reading from the fastest tier: */ + if ((flags & BCH_READ_PROMOTE) && CACHE_TIER(&pick->ca->mi)) { + promote_op = kmalloc(sizeof(*promote_op), GFP_NOIO); + + if (promote_op) + bounce = true; + } + + /* + * note: if compression_type and crc_type both == none, then + * compressed/uncompressed size is zero + */ + if (pick->crc.compression_type != BCH_COMPRESSION_NONE || + (pick->crc.csum_type != BCH_CSUM_NONE && + (bio_sectors(orig) != pick->crc.uncompressed_size || + (flags & BCH_READ_FORCE_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (bounce) { + unsigned sectors = + !read_full ? bio_sectors(orig) + : pick->crc.compressed_size ?: k.k->size; + + bio = bio_alloc_bioset(GFP_NOIO, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read); + bch_bio_alloc_pages_pool(c, bio, sectors << 9); + } else { + bio = bio_clone_fast(orig, GFP_NOIO, &c->bio_read); + } + + rbio = container_of(bio, struct bch_read_bio, bio.bio); + memset(rbio, 0, offsetof(struct bch_read_bio, bio)); + + rbio->csum = pick->crc.csum; + rbio->compressed_size = pick->crc.compressed_size; + rbio->uncompressed_size = pick->crc.uncompressed_size; + rbio->offset = pick->crc.offset; + rbio->csum_type = pick->crc.csum_type; + rbio->compression_type = pick->crc.compression_type; + + __bio_inc_remaining(orig); + rbio->parent = orig; + rbio->parent_iter = orig->bi_iter; + rbio->c = c; + rbio->flags = flags; + rbio->bounce = bounce; + rbio->promote = promote_op; + rbio->bio.ptr = pick->ptr; + bio->bi_end_io = bch_read_endio; + bch_bbio_prep(&rbio->bio, pick->ca); + + if (read_full) + rbio->offset += skip; + else + bio->bi_iter.bi_sector += skip; + + if (promote_op) { + promote_op->orig_bio = bio; + + bch_write_op_init(&promote_op->iop, c, + &promote_op->bio, + &c->promote_write_point, + k, k, + BCH_WRITE_CHECK_ENOSPC| + BCH_WRITE_ALLOC_NOWAIT); + + if (!read_full) { + bch_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + skip), + &promote_op->iop.insert_key); + bch_key_resize(&promote_op->iop.insert_key.k, + bio_sectors(orig)); + } + + __bio_clone_fast(&promote_op->bio.bio.bio, bio); + } + +#ifndef CONFIG_BCACHE_NO_IO + generic_make_request(bio); +#else + bio_endio(bio); +#endif } /* XXX: this looks a lot like cache_lookup_fn() */ @@ -970,9 +1666,7 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode) for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, POS(inode, bio->bi_iter.bi_sector), k) { struct extent_pick_ptr pick; - struct bio *n; - struct bbio *bbio; - unsigned sectors; + unsigned bytes, sectors; bool done; BUG_ON(bkey_cmp(bkey_start_pos(k.k), @@ -981,8 +1675,12 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode) BUG_ON(bkey_cmp(k.k->p, POS(inode, bio->bi_iter.bi_sector)) <= 0); - sectors = k.k->p.offset - bio->bi_iter.bi_sector; - done = sectors >= bio_sectors(bio); + sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) - + bio->bi_iter.bi_sector; + bytes = sectors << 9; + done = bytes == bio->bi_iter.bi_size; + + swap(bio->bi_iter.bi_size, bytes); pick = bch_extent_pick_ptr(c, k); if (IS_ERR(pick.ca)) { @@ -994,43 +1692,19 @@ int bch_read(struct cache_set *c, struct bio *bio, u64 inode) PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = c->prio_clock[READ].hand; - n = 
sectors >= bio_sectors(bio) - ? bio_clone_fast(bio, GFP_NOIO, &c->bio_split) - : bio_split(bio, sectors, GFP_NOIO, - &c->bio_split); - - n->bi_private = bio; - n->bi_end_io = bch_read_endio; - __bio_inc_remaining(bio); - - bbio = to_bbio(n); - bbio->key.k = *k.k; - bbio->ptr = pick.ptr; - bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1); - - /* Trim the key to match what we're actually reading */ - bch_cut_front(POS(inode, n->bi_iter.bi_sector), - &bbio->key); - bch_cut_back(POS(inode, bio_end_sector(n)), - &bbio->key.k); - bch_bbio_prep(bbio, pick.ca); - -#ifndef CONFIG_BCACHE_NO_IO - cache_promote(c, bbio, k); -#else - bio_endio(n); -#endif + bch_read_extent(c, bio, k, &pick, + bio->bi_iter.bi_sector - + bkey_start_offset(k.k), + BCH_READ_FORCE_BOUNCE| + BCH_READ_RETRY_IF_STALE| + BCH_READ_PROMOTE); } else { - unsigned bytes = min_t(unsigned, sectors, - bio_sectors(bio)) << 9; - - swap(bio->bi_iter.bi_size, bytes); zero_fill_bio(bio); - swap(bio->bi_iter.bi_size, bytes); - - bio_advance(bio, bytes); } + swap(bio->bi_iter.bi_size, bytes); + bio_advance(bio, bytes); + if (done) { bch_btree_iter_unlock(&iter); return 0; @@ -1069,7 +1743,8 @@ static void bch_read_retry(struct bbio *bbio) * The inode, offset and size come from the bbio's key, * which was set by bch_read_fn(). */ - inode = bbio->key.k.p.inode; + BUG(); /* currently broken */ + //inode = bbio->key.k.p.inode; parent = bio->bi_private; bch_bbio_reset(bbio); diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h index fb024d2e5fa8..53c8b3aa07ea 100644 --- a/drivers/md/bcache/io.h +++ b/drivers/md/bcache/io.h @@ -1,17 +1,16 @@ #ifndef _BCACHE_IO_H #define _BCACHE_IO_H -struct bbio { - struct cache *ca; +#include <linux/zlib.h> - unsigned int bi_idx; /* current index into bvl_vec */ +#define COMPRESSION_WORKSPACE_SIZE \ + max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), \ + zlib_inflate_workspacesize()) - unsigned int bi_bvec_done; /* number of bytes completed in - current bvec */ - unsigned submit_time_us; - struct bkey_i key; +struct bbio { + struct cache *ca; struct bch_extent_ptr ptr; - /* Only ever have a single pointer (the one we're doing io to/from) */ + unsigned submit_time_us; struct bio bio; }; @@ -95,6 +94,41 @@ void bch_write_op_init(struct bch_write_op *, struct cache_set *, struct bkey_s_c, struct bkey_s_c, unsigned); void bch_write(struct closure *); +struct cache_promote_op; + +struct bch_read_bio { + struct bio *parent; + struct bvec_iter parent_iter; + + struct cache_set *c; + unsigned flags; + + /* fields align with bch_extent_crc64 */ + u64 bounce:3, + compressed_size:18, + uncompressed_size:18, + offset:17, + csum_type:4, + compression_type:4; + u64 csum; + + struct cache_promote_op *promote; + + struct llist_node list; + struct bbio bio; +}; + +struct extent_pick_ptr; + +void bch_read_extent(struct cache_set *, struct bio *, struct bkey_s_c, + struct extent_pick_ptr *, unsigned, unsigned); + +enum bch_read_flags { + BCH_READ_FORCE_BOUNCE = 1 << 0, + BCH_READ_RETRY_IF_STALE = 1 << 1, + BCH_READ_PROMOTE = 1 << 2, +}; + int bch_read(struct cache_set *, struct bio *, u64); void bch_cache_io_error_work(struct work_struct *); @@ -104,8 +138,7 @@ void bch_bbio_endio(struct bbio *, int, const char *); void bch_generic_make_request(struct bio *, struct cache_set *); void bch_bio_submit_work(struct work_struct *); -void bch_bbio_prep(struct bbio *, struct cache *); -void bch_submit_bbio(struct bbio *, struct cache *, const struct bkey_i *, +void bch_submit_bbio(struct bbio *, struct cache *, const 
struct bch_extent_ptr *, bool); void bch_submit_bbio_replicas(struct bch_write_bio *, struct cache_set *, const struct bkey_i *, unsigned, bool); @@ -119,6 +152,8 @@ bool cache_promote(struct cache_set *, struct bbio *, struct bkey_s_c); void bch_read_race_work(struct work_struct *); void bch_wake_delayed_writes(unsigned long data); +void bch_bio_decompress_work(struct work_struct *); + extern struct workqueue_struct *bcache_io_wq; #endif /* _BCACHE_IO_H */ diff --git a/drivers/md/bcache/io_types.h b/drivers/md/bcache/io_types.h new file mode 100644 index 000000000000..2a8e7c6a7386 --- /dev/null +++ b/drivers/md/bcache/io_types.h @@ -0,0 +1,12 @@ +#ifndef _BCACHE_IO_TYPES_H +#define _BCACHE_IO_TYPES_H + +#include <linux/llist.h> +#include <linux/workqueue.h> + +struct bio_decompress_worker { + struct work_struct work; + struct llist_head bio_list; +}; + +#endif /* _BCACHE_IO_TYPES_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 0a7550a0294d..08879ba95be7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -1170,7 +1170,7 @@ static void journal_next_bucket(struct cache_set *c) struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; struct cache *ca; - unsigned iter; + unsigned iter, replicas; lockdep_assert_held(&j->lock); @@ -1198,7 +1198,11 @@ static void journal_next_bucket(struct cache_set *c) if (!(ca = PTR_CACHE(c, ptr)) || CACHE_STATE(&ca->mi) != CACHE_ACTIVE || ca->journal.sectors_free <= j->sectors_free) - bch_extent_drop_ptr(e, ptr); + __bch_extent_drop_ptr(e, ptr); + + replicas = 0; + extent_for_each_ptr(e, ptr) + replicas++; /* * Determine location of the next journal write: @@ -1209,7 +1213,7 @@ static void journal_next_bucket(struct cache_set *c) unsigned next, remaining, nr_buckets = bch_nr_journal_buckets(&ca->sb); - if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb)) + if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb)) break; /* @@ -1247,28 +1251,25 @@ static void journal_next_bucket(struct cache_set *c) if (!remaining) continue; - BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX); - ja->sectors_free = ca->mi.bucket_size; - ja->cur_idx = next; - e.v->ptr[bch_extent_ptrs(e)] = (struct bch_extent_ptr) { - .gen = 0, - .dev = ca->sb.nr_this_dev, - .offset = bucket_to_sector(ca, - journal_bucket(ca, ja->cur_idx)), - }; - ja->bucket_seq[ja->cur_idx] = j->seq; + extent_ptr_append(bkey_i_to_extent(&j->key), + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + journal_bucket(ca, ja->cur_idx)), + .dev = ca->sb.nr_this_dev, + }); + replicas++; + trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); - bch_set_extent_ptrs(e, bch_extent_ptrs(e) + 1); } /* set j->sectors_free to the min of any device */ j->sectors_free = UINT_MAX; - if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb)) + if (replicas >= CACHE_SET_META_REPLICAS_WANT(&c->sb)) extent_for_each_online_device(c, e, ptr, ca) j->sectors_free = min(j->sectors_free, ca->journal.sectors_free); diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index 66bf35c082c5..4ee369a4b7a9 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -136,14 +136,6 @@ static enum migrate_option migrate_cleanup_key(struct cache_set *c, return MIGRATE_IGNORE; } - /* - * Remove all pointers, to avoid too many in a tier. - * migrate_compact_key above does the same when nr_replicas is 1, and - * doesn't actually work if nr_replicas > 1, so do something simple - * instead. 
Effectively, every migration copy is a fresh 'foreground' - * write. - */ - bch_set_extent_ptrs(e, 0); return MIGRATE_COPY; } diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c index 2aed02880a36..87dcac33cb4b 100644 --- a/drivers/md/bcache/move.c +++ b/drivers/md/bcache/move.c @@ -427,9 +427,8 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q) static void read_moving_endio(struct bio *bio) { - struct bbio *b = container_of(bio, struct bbio, bio); - struct moving_io *io = container_of(bio->bi_private, - struct moving_io, cl); + struct closure *cl = bio->bi_private; + struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_queue *q = io->q; struct moving_context *ctxt = io->context; bool stopped; @@ -439,11 +438,9 @@ static void read_moving_endio(struct bio *bio) if (bio->bi_error) { io->op.error = bio->bi_error; moving_error(io->context, MOVING_FLAG_READ); - } else if (ptr_stale(b->ca, &bkey_i_to_extent_c(&b->key)->v.ptr[0])) { - io->op.error = -EINTR; } - bch_bbio_endio(b, bio->bi_error, "reading data to move"); + bio_put(bio); spin_lock_irqsave(&q->lock, flags); @@ -488,7 +485,10 @@ static void __bch_data_move(struct closure *cl) bio_set_op_attrs(&io->bio.bio.bio, REQ_OP_READ, 0); io->bio.bio.bio.bi_end_io = read_moving_endio; - bch_submit_bbio(&io->bio.bio, pick.ca, &io->key, &pick.ptr, false); + bch_read_extent(io->op.c, &io->bio.bio.bio, + bkey_i_to_s_c(&io->key), + &pick, 0, 0); + bio_endio(&io->bio.bio.bio); } /* diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 17a910ef114e..167c2f185f0e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -364,9 +364,9 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s, miss->bi_end_io = request_endio; miss->bi_private = &s->cl; - to_bbio(miss)->key.k = KEY(s->inode, - bio_end_sector(miss), - bio_sectors(miss)); + //to_bbio(miss)->key.k = KEY(s->inode, + // bio_end_sector(miss), + // bio_sectors(miss)); to_bbio(miss)->ca = NULL; closure_get(&s->cl); @@ -375,7 +375,7 @@ static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s, bkey_to_s_c(&KEY(replace.key.k.p.inode, replace.key.k.p.offset, replace.key.k.size)), - BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED); + BCH_WRITE_CACHED); return 0; nopromote: @@ -388,23 +388,6 @@ nopromote: return 0; } -static void bch_cache_read_endio(struct bio *bio) -{ - struct bbio *b = to_bbio(bio); - struct closure *cl = bio->bi_private; - struct search *s = container_of(cl, struct search, cl); - - if (bio->bi_error) - s->iop.error = bio->bi_error; - else if (ptr_stale(b->ca, &b->ptr)) { - /* Read bucket invalidate race */ - atomic_long_inc(&s->iop.c->cache_read_races); - s->iop.error = -EINTR; - } - - bch_bbio_endio(b, bio->bi_error, "reading from cache"); -} - static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -417,9 +400,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) for_each_btree_key_with_holes(&iter, s->iop.c, BTREE_ID_EXTENTS, POS(s->inode, bio->bi_iter.bi_sector), k) { struct extent_pick_ptr pick; - struct bio *n; - struct bbio *bbio; - unsigned sectors; + unsigned sectors, bytes; bool done; retry: BUG_ON(bkey_cmp(bkey_start_pos(k.k), @@ -428,8 +409,12 @@ retry: BUG_ON(bkey_cmp(k.k->p, POS(s->inode, bio->bi_iter.bi_sector)) <= 0); - sectors = k.k->p.offset - bio->bi_iter.bi_sector; - done = sectors >= bio_sectors(bio); + sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) 
- + bio->bi_iter.bi_sector; + bytes = sectors << 9; + done = bytes == bio->bi_iter.bi_size; + + swap(bio->bi_iter.bi_size, bytes); pick = bch_extent_pick_ptr(s->iop.c, k); if (IS_ERR(pick.ca)) { @@ -452,33 +437,17 @@ retry: if (!bkey_extent_is_cached(k.k)) s->read_dirty_data = true; - n = bio_next_split(bio, sectors, GFP_NOIO, - &s->d->bio_split); - - bbio = to_bbio(n); - bbio->key.k = *k.k; - bbio->ptr = pick.ptr; - bch_set_extent_ptrs(bkey_i_to_s_extent(&bbio->key), 1); - - /* Trim the key to match what we're actually reading */ - bch_cut_front(POS(s->inode, n->bi_iter.bi_sector), - &bbio->key); - bch_cut_back(POS(s->inode, bio_end_sector(n)), - &bbio->key.k); - - bch_bbio_prep(bbio, pick.ca); - - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; - - closure_get(&s->cl); - if (!s->bypass) { - if (cache_promote(s->iop.c, bbio, k)) - s->cache_miss = 1; - } else - submit_bio(n); + bch_read_extent(s->iop.c, bio, k, &pick, + bio->bi_iter.bi_sector - + bkey_start_offset(k.k), + BCH_READ_FORCE_BOUNCE| + BCH_READ_RETRY_IF_STALE| + (!s->bypass ? BCH_READ_PROMOTE : 0)); } + swap(bio->bi_iter.bi_size, bytes); + bio_advance(bio, bytes); + if (done) { bch_btree_iter_unlock(&iter); goto out; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 61047a66fb0c..05c62ca25f74 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -142,6 +142,8 @@ static const char *bch_blkdev_open(const char *path, void *holder, if (IS_ERR(bdev)) return "failed to open device"; + bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; + *ret = bdev; return NULL; } @@ -764,9 +766,12 @@ static void cache_set_free(struct closure *cl) bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); bdi_destroy(&c->bdi); - bioset_exit(&c->btree_read_bio); + free_percpu(c->bio_decompress_worker); + mempool_exit(&c->compression_workspace_pool); + mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->bio_write); - bioset_exit(&c->bio_split); + bioset_exit(&c->bio_read); + bioset_exit(&c->btree_read_bio); mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); mempool_exit(&c->search); @@ -893,6 +898,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { struct cache_set *c; unsigned iter_size; + int cpu; c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); if (!c) @@ -952,9 +958,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freeable); INIT_LIST_HEAD(&c->btree_cache_freed); + mutex_init(&c->bio_bounce_pages_lock); INIT_WORK(&c->bio_submit_work, bch_bio_submit_work); spin_lock_init(&c->bio_submit_lock); - bio_list_init(&c->read_race_list); spin_lock_init(&c->read_race_lock); INIT_WORK(&c->read_race_work, bch_read_race_work); @@ -992,9 +998,14 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, BTREE_RESERVE_SIZE) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) || - bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) || bioset_init(&c->btree_read_bio, 1, offsetof(struct bbio, bio)) || + bioset_init(&c->bio_read, 4, offsetof(struct bch_read_bio, bio.bio)) || + bioset_init(&c->bio_write, 4, offsetof(struct bch_write_bio, bio.bio)) || + mempool_init_page_pool(&c->bio_bounce_pages, + CRC32_EXTENT_SIZE_MAX / PAGE_SECTORS, 0) || + mempool_init_page_pool(&c->compression_workspace_pool, 1, + 
get_order(COMPRESSION_WORKSPACE_SIZE)) || + !(c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker)) || bdi_setup_and_register(&c->bdi, "bcache") || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || @@ -1003,9 +1014,18 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c)))) goto err; + for_each_possible_cpu(cpu) { + struct bio_decompress_worker *d = + per_cpu_ptr(c->bio_decompress_worker, cpu); + + INIT_WORK(&d->work, bch_bio_decompress_work); + init_llist_head(&d->bio_list); + } + c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; c->bdi.congested_fn = bch_congested_fn; c->bdi.congested_data = c; + c->bdi.capabilities |= BDI_CAP_STABLE_WRITES; return c; err: diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index ed05c6f4d412..62108446b82d 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -24,30 +24,23 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; struct cache_member_rcu *mi; - unsigned replicas = CACHE_SET_DATA_REPLICAS_WANT(&c->sb); - unsigned dev; - bool ret = false; + unsigned replicas = 0; - /* - * Should not happen except in a pathological situation (too - * many pointers on the wrong tier? - */ - if (bch_extent_ptrs(e) == BKEY_EXTENT_PTRS_MAX) + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_MAX_U64s > + BKEY_EXTENT_VAL_U64s_MAX) return false; - /* - * Need at least CACHE_SET_DATA_REPLICAS_WANT ptrs not on tier 0 - */ - if (bch_extent_ptrs(e) < replicas) - return true; - - dev = e.v->ptr[bch_extent_ptrs(e) - replicas].dev; mi = cache_member_info_get(c); - ret = dev < mi->nr_in_set && !CACHE_TIER(&mi->m[dev]); + extent_for_each_ptr(e, ptr) + if (ptr->dev < mi->nr_in_set && + CACHE_TIER(&mi->m[ptr->dev])) + replicas++; cache_member_info_put(); - return ret; + return replicas < CACHE_SET_DATA_REPLICAS_WANT(&c->sb); } return false; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index f03453ee69f1..2ca58a386cdf 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -621,13 +621,11 @@ TRACE_EVENT(bcache_btree_insert_key, __field(u64, b_bucket ) __field(u64, b_offset ) __field(u64, offset ) - __field(u64, bucket ) __field(u32, b_inode ) __field(u32, inode ) __field(u32, size ) __field(u8, level ) __field(u8, id ) - __field(u8, cached ) __field(u8, op ) __field(u8, insert_done ) ), @@ -638,22 +636,18 @@ TRACE_EVENT(bcache_btree_insert_key, __entry->id = b->btree_id; __entry->b_inode = b->key.k.p.inode; __entry->b_offset = b->key.k.p.offset; - __entry->bucket = PTR_BUCKET_NR_TRACE(b->c, k, 0); __entry->inode = k->k.p.inode; __entry->offset = k->k.p.offset; __entry->size = k->k.size; - __entry->cached = bkey_extent_is_cached(&k->k); __entry->op = op; __entry->insert_done = insert_done; ), - TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u%s -> %llu", + TP_printk("%u for %u bucket %llu(%u) id %u: %u:%llu %u:%llu len %u", __entry->insert_done, __entry->op, __entry->b_bucket, __entry->level, __entry->id, __entry->b_inode, __entry->b_offset, - __entry->inode, __entry->offset, - __entry->size, __entry->cached ? 
" cached" : "", - __entry->bucket) + __entry->inode, __entry->offset, __entry->size) ); DECLARE_EVENT_CLASS(btree_split, diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 8ea6758301a7..a5ab2935c146 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -244,19 +244,139 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); /* Extents */ /* - * bcache keys index the end of the extent as the offset - * The end is exclusive, while the start is inclusive + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the start of the data that + * is currently live. The size field in struct bkey records the current (live) + * size of the extent, and is also used to mean "size of region on disk that we + * point to" in this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. 
We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. */ +enum bch_extent_entry_type { + BCH_EXTENT_ENTRY_crc32 = 0, + BCH_EXTENT_ENTRY_ptr = 1, + BCH_EXTENT_ENTRY_crc64 = 2, +}; + +#define BCH_EXTENT_ENTRY_MAX 3 + +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:1, + offset:7, + compressed_size:8, + uncompressed_size:8, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum_type:4, + compression_type:4, + uncompressed_size:8, + compressed_size:8, + offset:7, + type:1; +#endif + __u32 csum; +} __attribute__((packed)) __attribute__((aligned(8))); + +#define CRC32_EXTENT_SIZE_MAX (1U << 7) + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + compressed_size:18, + uncompressed_size:18, + offset:17, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_type:4, + compression_type:4, + offset:17, + uncompressed_size:18, + compressed_size:18, + type:3; +#endif + __u64 csum; +} __attribute__((packed)) __attribute__((aligned(8))); + +#define CRC64_EXTENT_SIZE_MAX (1U << 17) + struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 gen:8, + __u64 type:2, + erasure_coded:1, + offset:45, /* 16 petabytes */ dev:8, - offset:48; + gen:8; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 offset:48, + __u64 gen:8, dev:8, - gen:8; + offset:45, + erasure_coded:1, + type:2; #endif } __attribute__((packed)) __attribute__((aligned(8))); @@ -264,6 +384,13 @@ struct bch_extent_ptr { #define PTR_LOST_DEV 255 /* XXX: kill */ +union bch_extent_entry { + __u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_ptr ptr; +}; + enum { BCH_EXTENT = 128, @@ -277,9 +404,10 @@ enum { struct bch_extent { struct bch_val v; - struct bch_extent_ptr ptr[0]; + + union bch_extent_entry start[0]; __u64 _data[0]; -}; +} __attribute__((packed)) __attribute__((aligned(8))); BKEY_VAL_TYPE(extent, BCH_EXTENT); /* Inodes */ @@ -552,6 +680,18 @@ enum { BCH_DIRENT_CSUM_SHA1 = 3, }; +BITMASK(CACHE_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52); + +BITMASK(CACHE_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56); +enum { + BCH_COMPRESSION_NONE = 0, + BCH_COMPRESSION_LZO1X = 1, + BCH_COMPRESSION_GZIP = 2, + BCH_COMPRESSION_XZ = 3, +}; + +/* backing device specific stuff: */ + BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); #define CACHE_MODE_WRITETHROUGH 0U #define CACHE_MODE_WRITEBACK 1U |
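The tag scheme just described can be decoded by finding the position of the lowest set bit in an entry's first byte, which is presumably what extent_entry_type() in extents.h does. A short sketch of the decode and of the per-entry sizes needed to walk a bch_extent value; the helper names are illustrative, the types and constants are the ones declared above:

#include <linux/bitops.h>

/*
 * 0b1 = crc32, 0b10 = ptr, 0b100 = crc64; the other bitfields sharing the
 * first byte sit above the tag bits, so they never affect the lowest set bit.
 */
static inline unsigned extent_entry_type_sketch(const union bch_extent_entry *e)
{
	return __ffs(e->type & (BIT(BCH_EXTENT_ENTRY_MAX) - 1));
}

/* entry sizes in u64s, for stepping from one entry to the next: */
static const unsigned extent_entry_u64s_sketch[] = {
	[BCH_EXTENT_ENTRY_crc32]	= sizeof(struct bch_extent_crc32) / sizeof(__u64),
	[BCH_EXTENT_ENTRY_ptr]		= sizeof(struct bch_extent_ptr) / sizeof(__u64),
	[BCH_EXTENT_ENTRY_crc64]	= sizeof(struct bch_extent_crc64) / sizeof(__u64),
};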