diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2020-06-24 14:27:48 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2020-06-28 18:12:46 -0400 |
commit | 7b42d65c0607af212b9ef3aa834ea449e3f846b1 (patch) | |
tree | 44b973d839aacebf7564169a010d2c7a9e1c88cf | |
parent | 919b1e2379b8f5ae68ddb4a63ca208373bae9fdc (diff) |
bcache: Header refactoring
For the bcache + bcachefs layer, we're going to need to reorganize the
headers so that we can have code that interfaces with both bcachefs and
parts of bcache.
-rw-r--r-- | drivers/md/bcache/backingdev.h | 251 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 205 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/debug.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 168 | ||||
-rw-r--r-- | drivers/md/bcache/io.h | 26 | ||||
-rw-r--r-- | drivers/md/bcache/movinggc.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 152 | ||||
-rw-r--r-- | drivers/md/bcache/request.h | 1 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/super.h | 34 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.h | 58 | ||||
-rw-r--r-- | include/uapi/linux/bcache.h | 171 | ||||
-rw-r--r-- | include/uapi/linux/bcache_superblock.h | 177 |
16 files changed, 667 insertions, 583 deletions
diff --git a/drivers/md/bcache/backingdev.h b/drivers/md/bcache/backingdev.h new file mode 100644 index 000000000000..58362eb7902a --- /dev/null +++ b/drivers/md/bcache/backingdev.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_BACKINGDEV_H +#define _BCACHE_BACKINGDEV_H + +#include <linux/bio.h> +#include <linux/closure.h> +#include <linux/kobject.h> +#include <linux/list.h> + +#include <linux/bcache_superblock.h> +#include <linux/bcache/ratelimit.h> + +#include "stats.h" +#include "super.h" + +struct search; +struct btree; + +struct bcache_device { + struct closure cl; + + struct kobject kobj; + + struct cache_set *c; + unsigned int id; +#define BCACHEDEVNAME_SIZE 12 + char name[BCACHEDEVNAME_SIZE]; + + struct gendisk *disk; + + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 +#define BCACHE_DEV_WB_RUNNING 3 +#define BCACHE_DEV_RATE_DW_RUNNING 4 + unsigned int nr_stripes; + unsigned int stripe_size; + atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; + + struct bio_set bio_split; + + unsigned int data_csum:1; + + int (*cache_miss)(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors); + int (*ioctl)(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg); +}; + +enum stop_on_failure { + BCH_CACHED_DEV_STOP_AUTO = 0, + BCH_CACHED_DEV_STOP_ALWAYS, + BCH_CACHED_DEV_STOP_MODE_MAX, +}; + +struct io { + /* Used to track sequential IO so it can be skipped */ + struct hlist_node hash; + struct list_head lru; + + unsigned long jiffies; + unsigned int sequential; + sector_t last; +}; + +struct cached_dev { + struct list_head list; + struct bcache_device disk; + struct block_device *bdev; + + struct cache_sb sb; + struct cache_sb_disk *sb_disk; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + struct closure sb_write; + struct semaphore sb_write_mutex; + + /* Refcount on the cache set. 
Always nonzero when we're caching. */ + refcount_t count; + struct work_struct detach; + + /* + * Device might not be running if it's dirty and the cache set hasn't + * showed up yet. + */ + atomic_t running; + + /* + * Writes take a shared lock from start to finish; scanning for dirty + * data to refill the rb tree requires an exclusive lock. + */ + struct rw_semaphore writeback_lock; + + /* + * Nonzero, and writeback has a refcount (d->count), iff there is dirty + * data in the cache. Protected by writeback_lock; must have an + * shared lock to set and exclusive lock to clear. + */ + atomic_t has_dirty; + +#define BCH_CACHE_READA_ALL 0 +#define BCH_CACHE_READA_META_ONLY 1 + unsigned int cache_readahead_policy; + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; + + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; + struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; + + struct keybuf *writeback_keys; + + struct task_struct *status_update_thread; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. (Maintain LBA order; don't allow reads completing out of + * order to re-order the writes...) 
+ */ + struct closure_waitlist writeback_ordering_wait; + atomic_t writeback_sequence_next; + + /* For tracking sequential IO */ +#define RECENT_IO_BITS 7 +#define RECENT_IO (1 << RECENT_IO_BITS) + struct io io[RECENT_IO]; + struct hlist_head io_hash[RECENT_IO + 1]; + struct list_head io_lru; + spinlock_t io_lock; + + struct cache_accounting accounting; + + /* The rest of this all shows up in sysfs */ + unsigned int sequential_cutoff; + unsigned int readahead; + + unsigned int io_disable:1; + unsigned int verify:1; + unsigned int bypass_torture_test:1; + + unsigned int partial_stripes_expensive:1; + unsigned int writeback_metadata:1; + unsigned int writeback_running:1; + unsigned char writeback_percent; + unsigned int writeback_delay; + + uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; + int32_t writeback_rate_change; + + unsigned int writeback_rate_update_seconds; + unsigned int writeback_rate_i_term_inverse; + unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_minimum; + + enum stop_on_failure stop_when_cache_set_failed; +#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned int error_limit; + unsigned int offline_seconds; + + char backing_dev_name[BDEVNAME_SIZE]; +}; + +static inline unsigned int cache_mode(struct cached_dev *dc) +{ + return BDEV_CACHE_MODE(&dc->sb); +} + +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + +static inline void cached_dev_put(struct cached_dev *dc) +{ + if (refcount_dec_and_test(&dc->count)) + schedule_work(&dc->detach); +} + +static inline bool cached_dev_get(struct cached_dev *dc) +{ + if (!refcount_inc_not_zero(&dc->count)) + return false; + + /* Paired with the mb in cached_dev_attach */ + smp_mb__after_atomic(); + return true; +} + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 
0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +static inline unsigned int offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, + uint64_t offset, + unsigned int nr_sectors) +{ + unsigned int stripe = offset_to_stripe(&dc->disk, offset); + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + +#define CUTOFF_CACHE_ADD 95 +#define CUTOFF_CACHE_READA 90 + +#endif /* _BCACHE_BACKINGDEV_H */ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 646ce2bacb3c..b779098c6aa1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -192,6 +192,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include "backingdev.h" #include "bset.h" #include "util.h" @@ -247,156 +248,6 @@ struct keybuf { DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; -struct bcache_device { - struct closure cl; - - struct kobject kobj; - - struct cache_set *c; - unsigned int id; -#define BCACHEDEVNAME_SIZE 12 - char name[BCACHEDEVNAME_SIZE]; - - struct gendisk *disk; - - unsigned long flags; -#define BCACHE_DEV_CLOSING 0 -#define BCACHE_DEV_DETACHING 1 -#define BCACHE_DEV_UNLINK_DONE 2 -#define 
BCACHE_DEV_WB_RUNNING 3 -#define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned int nr_stripes; - unsigned int stripe_size; - atomic_t *stripe_sectors_dirty; - unsigned long *full_dirty_stripes; - - struct bio_set bio_split; - - unsigned int data_csum:1; - - int (*cache_miss)(struct btree *b, struct search *s, - struct bio *bio, unsigned int sectors); - int (*ioctl)(struct bcache_device *d, fmode_t mode, - unsigned int cmd, unsigned long arg); -}; - -struct io { - /* Used to track sequential IO so it can be skipped */ - struct hlist_node hash; - struct list_head lru; - - unsigned long jiffies; - unsigned int sequential; - sector_t last; -}; - -enum stop_on_failure { - BCH_CACHED_DEV_STOP_AUTO = 0, - BCH_CACHED_DEV_STOP_ALWAYS, - BCH_CACHED_DEV_STOP_MODE_MAX, -}; - -struct cached_dev { - struct list_head list; - struct bcache_device disk; - struct block_device *bdev; - - struct cache_sb sb; - struct cache_sb_disk *sb_disk; - struct bio sb_bio; - struct bio_vec sb_bv[1]; - struct closure sb_write; - struct semaphore sb_write_mutex; - - /* Refcount on the cache set. Always nonzero when we're caching. */ - refcount_t count; - struct work_struct detach; - - /* - * Device might not be running if it's dirty and the cache set hasn't - * showed up yet. - */ - atomic_t running; - - /* - * Writes take a shared lock from start to finish; scanning for dirty - * data to refill the rb tree requires an exclusive lock. - */ - struct rw_semaphore writeback_lock; - - /* - * Nonzero, and writeback has a refcount (d->count), iff there is dirty - * data in the cache. Protected by writeback_lock; must have an - * shared lock to set and exclusive lock to clear. 
- */ - atomic_t has_dirty; - -#define BCH_CACHE_READA_ALL 0 -#define BCH_CACHE_READA_META_ONLY 1 - unsigned int cache_readahead_policy; - struct bch_ratelimit writeback_rate; - struct delayed_work writeback_rate_update; - - /* Limit number of writeback bios in flight */ - struct semaphore in_flight; - struct task_struct *writeback_thread; - struct workqueue_struct *writeback_write_wq; - - struct keybuf *writeback_keys; - - struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. (Maintain LBA order; don't allow reads completing out of - * order to re-order the writes...) - */ - struct closure_waitlist writeback_ordering_wait; - atomic_t writeback_sequence_next; - - /* For tracking sequential IO */ -#define RECENT_IO_BITS 7 -#define RECENT_IO (1 << RECENT_IO_BITS) - struct io io[RECENT_IO]; - struct hlist_head io_hash[RECENT_IO + 1]; - struct list_head io_lru; - spinlock_t io_lock; - - struct cache_accounting accounting; - - /* The rest of this all shows up in sysfs */ - unsigned int sequential_cutoff; - unsigned int readahead; - - unsigned int io_disable:1; - unsigned int verify:1; - unsigned int bypass_torture_test:1; - - unsigned int partial_stripes_expensive:1; - unsigned int writeback_metadata:1; - unsigned int writeback_running:1; - unsigned char writeback_percent; - unsigned int writeback_delay; - - uint64_t writeback_rate_target; - int64_t writeback_rate_proportional; - int64_t writeback_rate_integral; - int64_t writeback_rate_integral_scaled; - int32_t writeback_rate_change; - - unsigned int writeback_rate_update_seconds; - unsigned int writeback_rate_i_term_inverse; - unsigned int writeback_rate_p_term_inverse; - unsigned int writeback_rate_minimum; - - enum stop_on_failure stop_when_cache_set_failed; -#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned int error_limit; - unsigned int offline_seconds; - - char backing_dev_name[BDEVNAME_SIZE]; -}; - enum 
alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, @@ -876,22 +727,6 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) -static inline void cached_dev_put(struct cached_dev *dc) -{ - if (refcount_dec_and_test(&dc->count)) - schedule_work(&dc->detach); -} - -static inline bool cached_dev_get(struct cached_dev *dc) -{ - if (!refcount_inc_not_zero(&dc->count)) - return false; - - /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic(); - return true; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). @@ -949,20 +784,6 @@ static inline void wait_for_kthread_stop(void) /* Forward declarations */ -void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); -void bch_count_io_errors(struct cache *ca, blk_status_t error, - int is_read, const char *m); -void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - blk_status_t error, const char *m); -void bch_bbio_endio(struct cache_set *c, struct bio *bio, - blk_status_t error, const char *m); -void bch_bbio_free(struct bio *bio, struct cache_set *c); -struct bio *bch_bbio_alloc(struct cache_set *c); - -void __bch_submit_bbio(struct bio *bio, struct cache_set *c); -void bch_submit_bbio(struct bio *bio, struct cache_set *c, - struct bkey *k, unsigned int ptr); - uint8_t bch_inc_gen(struct cache *ca, struct bucket *b); void bch_rescale_priorities(struct cache_set *c, int sectors); @@ -980,14 +801,10 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); -bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -int bch_prio_write(struct cache *ca, bool wait); -void bch_write_bdev_super(struct cached_dev 
*dc, struct closure *parent); - extern struct workqueue_struct *bcache_wq; extern struct workqueue_struct *bch_journal_wq; extern struct mutex bch_register_lock; @@ -999,26 +816,6 @@ extern struct kobj_type bch_cache_set_ktype; extern struct kobj_type bch_cache_set_internal_ktype; extern struct kobj_type bch_cache_ktype; -void bch_cached_dev_release(struct kobject *kobj); -void bch_flash_dev_release(struct kobject *kobj); -void bch_cache_set_release(struct kobject *kobj); -void bch_cache_release(struct kobject *kobj); - -int bch_uuid_write(struct cache_set *c); -void bcache_write_super(struct cache_set *c); - -int bch_flash_dev_create(struct cache_set *c, uint64_t size); - -int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - uint8_t *set_uuid); -void bch_cached_dev_detach(struct cached_dev *dc); -int bch_cached_dev_run(struct cached_dev *dc); -void bcache_device_stop(struct bcache_device *d); - -void bch_cache_set_unregister(struct cache_set *c); -void bch_cache_set_stop(struct cache_set *c); - -struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); void bch_btree_cache_free(struct cache_set *c); int bch_btree_cache_alloc(struct cache_set *c); void bch_moving_init_cache_set(struct cache_set *c); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b1b91f869641..a08d8d6db2de 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -25,6 +25,7 @@ #include "btree.h" #include "debug.h" #include "extents.h" +#include "io.h" #include <linux/slab.h> #include <linux/bitops.h> diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 336f43910383..efb7fb837d7e 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "extents.h" +#include "io.h" #include <linux/console.h> #include <linux/debugfs.h> diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 4d93f07f63e5..8ba675b8a2da 100644 --- 
a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -7,10 +7,178 @@ */ #include "bcache.h" +#include "backingdev.h" #include "bset.h" #include "debug.h" #include <linux/blkdev.h> +#include <linux/random.h> + +#include <trace/events/bcache.h> + +/* + * Congested? Return 0 (not congested) or the limit (in sectors) + * beyond which we should bypass the cache due to congestion. + */ +unsigned int bch_get_congested(const struct cache_set *c) +{ + int i; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + i -= hweight32(get_random_u32()); + + return i > 0 ? i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +bool bch_check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + unsigned int mode = cache_mode(dc); + unsigned int sectors, congested, dirty_percentage, block_size; + struct task_struct *task = current; + struct io *i; + + if (dc->disk.c) { + dirty_percentage = dc->disk.c->gc_stats.in_use; + block_size = dc->disk.c->sb.block_size; + } else { + /* XXX bcache2: */ + dirty_percentage = 0; + block_size = 0; + //block_size = dc->disk.c2->sb.block_size; + } + + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + dirty_percentage > CUTOFF_CACHE_ADD || + (bio_op(bio) == REQ_OP_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + op_is_write(bio_op(bio)))) + goto skip; + + /* + * If the bio is for read-ahead or background IO, bypass it or + * not depends on the following situations, + * - If the IO is for meta data, always cache it 
and no bypass + * - If the IO is not meta data, check dc->cache_reada_policy, + * BCH_CACHE_READA_ALL: cache it and not bypass + * BCH_CACHE_READA_META_ONLY: not cache it and bypass + * That is, read-ahead request for metadata always get cached + * (eg, for gfs2 or xfs). + */ + if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && + (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) + goto skip; + } + + if (bio->bi_iter.bi_sector & (block_size - 1) || + bio_sectors(bio) & (block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + + if (dc->disk.c) { + congested = bch_get_congested(dc->disk.c); + } else { + /* XXX bcache2: */ + congested = 0; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + if (dc->disk.c) { + bch_rescale_priorities(dc->disk.c, bio_sectors(bio)); + } else { + /* bcache2: */ + } + return false; 
+skip: + if (dc->disk.c) { + bch_mark_sectors_bypassed(dc->disk.c, dc, bio_sectors(bio)); + } else { + /* bcache2: */ + } + return true; +} /* Bios with headers */ diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h new file mode 100644 index 000000000000..59bc40c8b107 --- /dev/null +++ b/drivers/md/bcache/io.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_IO_H_ +#define _BCACHE_IO_H_ + +struct cached_dev; +struct cache; +struct cache_set; + +unsigned int bch_get_congested(const struct cache_set *c); +bool bch_check_should_bypass(struct cached_dev *dc, struct bio *bio); + +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); +void bch_count_io_errors(struct cache *ca, blk_status_t error, + int is_read, const char *m); +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_free(struct bio *bio, struct cache_set *c); +struct bio *bch_bbio_alloc(struct cache_set *c); + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c); +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned int ptr); + +#endif /* _BCACHE_IO_H_ */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7891fb512736..69e0f128a4f4 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -9,6 +9,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "io.h" #include <trace/events/bcache.h> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 8b85ad8b3f45..755bc6a448cb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -10,28 +10,21 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "io.h" #include "request.h" +#include "request2.h" #include "writeback.h" #include <linux/module.h> #include <linux/hash.h> -#include 
<linux/random.h> #include <linux/backing-dev.h> #include <trace/events/bcache.h> -#define CUTOFF_CACHE_ADD 95 -#define CUTOFF_CACHE_READA 90 - struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *cl); -static unsigned int cache_mode(struct cached_dev *dc) -{ - return BDEV_CACHE_MODE(&dc->sb); -} - static bool verify(struct cached_dev *dc) { return dc->verify; @@ -317,147 +310,6 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); } -/* - * Congested? Return 0 (not congested) or the limit (in sectors) - * beyond which we should bypass the cache due to congestion. - */ -unsigned int bch_get_congested(const struct cache_set *c) -{ - int i; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - i -= hweight32(get_random_u32()); - - return i > 0 ? 
i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) -{ - struct cache_set *c = dc->disk.c; - unsigned int mode = cache_mode(dc); - unsigned int sectors, congested; - struct task_struct *task = current; - struct io *i; - - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio_op(bio) == REQ_OP_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - op_is_write(bio_op(bio)))) - goto skip; - - /* - * If the bio is for read-ahead or background IO, bypass it or - * not depends on the following situations, - * - If the IO is for meta data, always cache it and no bypass - * - If the IO is not meta data, check dc->cache_reada_policy, - * BCH_CACHE_READA_ALL: cache it and not bypass - * BCH_CACHE_READA_META_ONLY: not cache it and bypass - * That is, read-ahead request for metadata always get cached - * (eg, for gfs2 or xfs). 
- */ - if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { - if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && - (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) - goto skip; - } - - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (bypass_torture_test(dc)) { - if ((get_random_int() & 3) == 3) - goto skip; - else - goto rescale; - } - - congested = bch_get_congested(c); - if (!congested && !dc->sequential_cutoff) - goto rescale; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) - if (i->last == bio->bi_iter.bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_iter.bi_size > i->sequential) - i->sequential += bio->bi_iter.bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - - sectors = max(task->sequential_io, - task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return false; -skip: - bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); - return true; -} - /* Cache lookup */ struct search { diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index bb005c93dd72..01a32c5d4ec4 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,7 +33,6 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -unsigned int bch_get_congested(const 
struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0da9b3af944c..f3f42862acc4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -11,7 +11,9 @@ #include "btree.h" #include "debug.h" #include "extents.h" +#include "io.h" #include "request.h" +#include "super.h" #include "writeback.h" #include <linux/blkdev.h> diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h new file mode 100644 index 000000000000..0ef3e83e8ff9 --- /dev/null +++ b/drivers/md/bcache/super.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_SUPER_H +#define _BCACHE_SUPER_H + +struct cached_dev; +struct cache; +struct cache_set; + +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); +void bcache_write_super(struct cache_set *c); +int bch_uuid_write(struct cache_set *c); +int bch_prio_write(struct cache *ca, bool wait); + +void bcache_device_stop(struct bcache_device *d); +int bch_cached_dev_run(struct cached_dev *dc); +void bch_cached_dev_detach(struct cached_dev *dc); +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); + +bool bch_cached_dev_error(struct cached_dev *dc); + +int bch_flash_dev_create(struct cache_set *c, uint64_t size); + +void bch_cached_dev_release(struct kobject *kobj); +void bch_flash_dev_release(struct kobject *kobj); +void bch_cache_set_release(struct kobject *kobj); +void bch_cache_release(struct kobject *kobj); + +void bch_cache_set_unregister(struct cache_set *c); +void bch_cache_set_stop(struct cache_set *c); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); + +#endif /* _BCACHE_SUPER_H */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 059ca573f2ab..34d09d94e95d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -9,6 +9,7 @@ #include "bcache.h" 
#include "sysfs.h" #include "btree.h" +#include "io.h" #include "request.h" #include "writeback.h" diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4e8b6f42e19b..5487291035b0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -10,6 +10,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "io.h" #include "writeback.h" #include <linux/delay.h> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index f69bef938a0a..a83700acc4ce 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -42,44 +42,6 @@ struct bch_dirty_init_state { struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; }; -static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) -{ - uint64_t i, ret = 0; - - for (i = 0; i < d->nr_stripes; i++) - ret += atomic_read(d->stripe_sectors_dirty + i); - - return ret; -} - -static inline unsigned int offset_to_stripe(struct bcache_device *d, - uint64_t offset) -{ - do_div(offset, d->stripe_size); - return offset; -} - -static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, - uint64_t offset, - unsigned int nr_sectors) -{ - unsigned int stripe = offset_to_stripe(&dc->disk, offset); - - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) - return true; - - if (nr_sectors <= dc->disk.stripe_size) - return false; - - nr_sectors -= dc->disk.stripe_size; - stripe++; - } -} - -extern unsigned int bch_cutoff_writeback; -extern unsigned int bch_cutoff_writeback_sync; - static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { @@ -106,26 +68,6 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use <= bch_cutoff_writeback); } -static inline void bch_writeback_queue(struct cached_dev *dc) -{ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - wake_up_process(dc->writeback_thread); -} - -static inline void 
bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - } -} - void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors); diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 9a1965c6c3d0..ee308b5fc2b5 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -6,17 +6,7 @@ * Bcache on disk data structures */ -#include <linux/types.h> - -#define BITMASK(name, type, field, offset, size) \ -static inline __u64 name(const type *k) \ -{ return (k->field >> offset) & ~(~0ULL << size); } \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - k->field &= ~(~(~0ULL << size) << offset); \ - k->field |= (v & ~(~0ULL << size)) << offset; \ -} +#include <linux/bcache_superblock.h> /* Btree keys - all units are in sectors */ @@ -135,165 +125,6 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) /* Superblock */ -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb_disk { - __le64 csum; - __le64 offset; /* sector where this sb was 
written */ - __le64 version; - - __u8 magic[16]; - - __u8 uuid[16]; - union { - __u8 set_uuid[16]; - __le64 set_magic; - }; - __u8 label[SB_LABEL_SIZE]; - - __le64 flags; - __le64 seq; - __le64 pad[8]; - - union { - struct { - /* Cache devices */ - __le64 nbuckets; /* device size */ - - __le16 block_size; /* sectors */ - __le16 bucket_size; /* sectors */ - - __le16 nr_in_set; - __le16 nr_this_dev; - }; - struct { - /* Backing devices */ - __le64 data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - __le32 last_mount; /* time overflow in y2106 */ - - __le16 first_bucket; - union { - __le16 njournal_buckets; - __le16 keys; - }; - __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -struct cache_sb { - __u64 csum; - __u64 offset; /* sector where this sb was written */ - __u64 version; - - __u8 magic[16]; - - __u8 uuid[16]; - union { - __u8 set_uuid[16]; - __u64 set_magic; - }; - __u8 label[SB_LABEL_SIZE]; - - __u64 flags; - __u64 seq; - __u64 pad[8]; - - union { - struct { - /* Cache devices */ - __u64 nbuckets; /* device size */ - - __u16 block_size; /* sectors */ - __u16 bucket_size; /* sectors */ - - __u16 nr_in_set; - __u16 nr_this_dev; - }; - struct { - /* Backing devices */ - __u64 data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - __u32 last_mount; /* time overflow in y2106 */ - - __u16 first_bucket; - union { - __u16 njournal_buckets; - __u16 keys; - }; - __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); 
-BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - static inline __u64 jset_magic(struct cache_sb *sb) { return sb->set_magic ^ JSET_MAGIC; diff --git a/include/uapi/linux/bcache_superblock.h b/include/uapi/linux/bcache_superblock.h new file mode 100644 index 000000000000..a152111ac55f --- /dev/null +++ b/include/uapi/linux/bcache_superblock.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_BCACHE_SUPERBLOCK_H +#define _LINUX_BCACHE_SUPERBLOCK_H + +#include <linux/types.h> + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define 
BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb_disk { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __le64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __le64 flags; + __le64 seq; + __le64 pad[8]; + + union { + struct { + /* Cache devices */ + __le64 nbuckets; /* device size */ + + __le16 block_size; /* sectors */ + __le16 bucket_size; /* sectors */ + + __le16 nr_in_set; + __le16 nr_this_dev; + }; + struct { + /* Backing devices */ + __le64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __le32 last_mount; /* time overflow in y2106 */ + + __le16 first_bucket; + union { + __le16 njournal_buckets; + __le16 keys; + }; + __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +struct cache_sb { + __u64 csum; + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + __u64 pad[8]; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 bucket_size; /* sectors */ + + __u16 nr_in_set; + __u16 nr_this_dev; + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for 
backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time overflow in y2106 */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U +#define CACHE_REPLACEMENT_NR 3U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +#endif /* _LINUX_BCACHE_SUPERBLOCK_H */ |