diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2015-04-11 00:48:41 -0700 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:33:52 -0800 |
commit | 81c307487a9a133a61cb20f44860a151dd78dbee (patch) | |
tree | 8a11101a2158c2112a917c933f17b7f3ea8ccd17 | |
parent | 346195d01ddd18f99e18e7502a849ed88c0534b9 (diff) |
bcache: General purpose IO clocks
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r-- | drivers/md/bcache/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.c | 38 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.h | 25 | ||||
-rw-r--r-- | drivers/md/bcache/alloc_types.h | 29 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 11 | ||||
-rw-r--r-- | drivers/md/bcache/clock.c | 74 | ||||
-rw-r--r-- | drivers/md/bcache/clock.h | 10 | ||||
-rw-r--r-- | drivers/md/bcache/clock_types.h | 32 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 13 |
11 files changed, 186 insertions, 51 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 453bfd1cb466..02ef2612777e 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bkey.o bkey_methods.o blockdev.o bset.o\ - btree.o buckets.o closure.o debug.o extents.o gc.o inode.o io.o\ + btree.o buckets.o clock.o closure.o debug.o extents.o gc.o inode.o io.o\ journal.o keybuf.o keylist.o migrate.o move.o movinggc.o notify.o\ request.o six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index d50ca2205ec0..b79745991532 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -57,6 +57,7 @@ #include "alloc.h" #include "btree.h" #include "buckets.h" +#include "clock.h" #include "extents.h" #include "io.h" #include "journal.h" @@ -502,18 +503,12 @@ static void bch_rescale_prios(struct cache_set *c, int rw) } } -void bch_increment_clock_slowpath(struct cache_set *c, int rw) +static void bch_inc_clock_hand(struct io_timer *timer) { - struct prio_clock *clock = &c->prio_clock[rw]; - long next = c->capacity >> 10; - long old, v = atomic_long_read(&clock->rescale); - - do { - old = v; - if (old >= 0) - return; - } while ((v = atomic_long_cmpxchg(&clock->rescale, - old, old + next)) != old); + struct prio_clock *clock = container_of(timer, + struct prio_clock, rescale); + struct cache_set *c = container_of(clock, + struct cache_set, prio_clock[clock->rw]); mutex_lock(&c->bucket_lock); @@ -521,9 +516,28 @@ void bch_increment_clock_slowpath(struct cache_set *c, int rw) /* if clock cannot be advanced more, rescale prio */ if (clock->hand == (u16) (clock->min_prio - 1)) - bch_rescale_prios(c, rw); + bch_rescale_prios(c, clock->rw); mutex_unlock(&c->bucket_lock); + + /* + * we only increment when 0.1% of the cache_set has been read + * or written to; this determines if it's time + */ + timer->expire += c->capacity >> 10; 
+ + bch_io_timer_add(&c->io_clock[clock->rw], timer); +} + +void bch_prio_timer_start(struct cache_set *c, int rw) +{ + struct prio_clock *clock = &c->prio_clock[rw]; + struct io_timer *timer = &clock->rescale; + + clock->rw = rw; + timer->fn = bch_inc_clock_hand; + timer->expire = c->capacity >> 10; + bch_io_timer_add(&c->io_clock[rw], timer); } /* diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index b962c5745d74..b2fc9d504561 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -15,30 +15,7 @@ void bch_cache_group_add_cache(struct cache_group *, struct cache *); int bch_prio_read(struct cache *); void bch_recalc_min_prio(struct cache *, int); -void bch_increment_clock_slowpath(struct cache_set *, int); - -static inline void bch_increment_clock(struct cache_set *c, - unsigned sectors, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - - /* Buffer up one megabyte worth of IO in the percpu counter */ - preempt_disable(); - if (this_cpu_add_return(*clock->rescale_percpu, sectors) < 2048) { - preempt_enable(); - return; - } - - sectors = this_cpu_xchg(*clock->rescale_percpu, 0); - preempt_enable(); - - /* - * we only increment when 0.1% of the cache_set has been read - * or written too, this determines if it's time - */ - if (atomic_long_sub_return(sectors, &clock->rescale) < 0) - bch_increment_clock_slowpath(c, rw); -} +void bch_prio_timer_start(struct cache_set *, int); void __bch_bucket_free(struct cache *, struct bucket *); void bch_bucket_free(struct cache_set *, struct bkey_i *); diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h index ae852620b37a..47f7a6585c8a 100644 --- a/drivers/md/bcache/alloc_types.h +++ b/drivers/md/bcache/alloc_types.h @@ -1,6 +1,35 @@ #ifndef _BCACHE_ALLOC_TYPES_H #define _BCACHE_ALLOC_TYPES_H +#include "clock_types.h" + +/* + * There's two of these clocks, one for reads and one for writes: + * + * All fields protected by bucket_lock + */ +struct prio_clock 
{ + /* + * "now" in (read/write) IO time - incremented whenever we do X amount + * of reads or writes. + * + * Goes with the bucket read/write prios: when we read or write to a + * bucket we reset the bucket's prio to the current hand; thus hand - + * prio = time since bucket was last read/written. + * + * The units are some amount (bytes/sectors) of data read/written, and + * the units can change on the fly if we need to rescale to fit + * everything in a u16 - your only guarantee is that the units are + * consistent. + */ + u16 hand; + u16 min_prio; + + int rw; + + struct io_timer rescale; +}; + /* There is one reserve for each type of btree, one for prios and gens * and one for moving GC */ enum alloc_reserve { diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b6a2887bc8c..b7791384bdcf 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -212,6 +212,7 @@ #include "bkey_methods.h" #include "blockdev_types.h" #include "buckets_types.h" +#include "clock_types.h" #include "journal_types.h" #include "keylist_types.h" #include "keybuf_types.h" @@ -385,14 +386,6 @@ enum { CACHE_SET_GC_FAILURE, }; -struct prio_clock { - /* All fields protected by bucket_lock */ - u16 hand; - u16 min_prio; - atomic_long_t rescale; - unsigned __percpu *rescale_percpu; -}; - struct cache_member_rcu { struct rcu_head rcu; unsigned nr_in_set; @@ -509,6 +502,8 @@ struct cache_set { */ struct prio_clock prio_clock[2]; + struct io_clock io_clock[2]; + /* SECTOR ALLOCATOR */ struct list_head open_buckets_open; struct list_head open_buckets_free; diff --git a/drivers/md/bcache/clock.c b/drivers/md/bcache/clock.c new file mode 100644 index 000000000000..bfc068f278db --- /dev/null +++ b/drivers/md/bcache/clock.c @@ -0,0 +1,74 @@ +#include "bcache.h" +#include "clock.h" + +static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r) +{ + return time_after(l->expire, r->expire); +} + +void bch_io_timer_add(struct io_clock *clock, struct 
io_timer *timer) +{ + spin_lock(&clock->timer_lock); + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); + spin_unlock(&clock->timer_lock); +} + +static struct io_timer *get_expired_timer(struct io_clock *clock, + unsigned long now) +{ + struct io_timer *ret = NULL; + + spin_lock(&clock->timer_lock); + + if (clock->timers.used && + time_after_eq(now, clock->timers.data[0]->expire)) + heap_pop(&clock->timers, ret, io_timer_cmp); + + spin_unlock(&clock->timer_lock); + + return ret; +} + +void bch_increment_clock(struct cache_set *c, unsigned sectors, int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + struct io_timer *timer; + unsigned long now; + + /* Buffer up IO_CLOCK_PCPU_SECTORS sectors of IO in the percpu counter */ + preempt_disable(); + + if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < + IO_CLOCK_PCPU_SECTORS)) { + preempt_enable(); + return; + } + + sectors = this_cpu_xchg(*clock->pcpu_buf, 0); + preempt_enable(); + now = atomic_long_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} + +void bch_io_clock_exit(struct io_clock *clock) +{ + free_heap(&clock->timers); + free_percpu(clock->pcpu_buf); +} + +int bch_io_clock_init(struct io_clock *clock) +{ + atomic_long_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -ENOMEM; + + if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/clock.h b/drivers/md/bcache/clock.h new file mode 100644 index 000000000000..37d5f20174a5 --- /dev/null +++ b/drivers/md/bcache/clock.h @@ -0,0 +1,10 @@ +#ifndef _BCACHE_CLOCK_H +#define _BCACHE_CLOCK_H + +void bch_io_timer_add(struct io_clock *, struct io_timer *); +void bch_increment_clock(struct cache_set *, unsigned, int); + +void bch_io_clock_exit(struct io_clock *); +int bch_io_clock_init(struct io_clock *); + +#endif /* _BCACHE_CLOCK_H */ diff 
--git a/drivers/md/bcache/clock_types.h b/drivers/md/bcache/clock_types.h new file mode 100644 index 000000000000..346466a9f987 --- /dev/null +++ b/drivers/md/bcache/clock_types.h @@ -0,0 +1,32 @@ +#ifndef _BCACHE_CLOCK_TYPES_H +#define _BCACHE_CLOCK_TYPES_H + +#define NR_IO_TIMERS 8 + +/* + * Clocks/timers in units of sectors of IO: + * + * Note - they use percpu batching, so they're only approximate. + */ + +struct io_timer; +typedef void (*io_timer_fn)(struct io_timer *); + +struct io_timer { + io_timer_fn fn; + unsigned long expire; +}; + +/* Amount to buffer up on a percpu counter */ +#define IO_CLOCK_PCPU_SECTORS 128 + +struct io_clock { + atomic_long_t now; + u16 __percpu *pcpu_buf; + + spinlock_t timer_lock; + DECLARE_HEAP(struct io_timer *, timers); +}; + +#endif /* _BCACHE_CLOCK_TYPES_H */ + diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index af801502d5d6..9b638562a912 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -10,6 +10,7 @@ #include "bset.h" #include "btree.h" #include "buckets.h" +#include "clock.h" #include "debug.h" #include "extents.h" #include "gc.h" diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f35d91fd627f..01febc9f5154 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,8 +25,8 @@ #include "bcache.h" #include "blockdev.h" -#include "alloc.h" #include "btree.h" +#include "clock.h" #include "debug.h" #include "extents.h" #include "io.h" diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f0429e8bb402..94af71c14745 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -10,6 +10,7 @@ #include "blockdev.h" #include "alloc.h" #include "btree.h" +#include "clock.h" #include "debug.h" #include "gc.h" #include "inode.h" @@ -766,8 +767,8 @@ static void cache_set_free(struct closure *cl) bch_bset_sort_state_free(&c->sort); percpu_ref_exit(&c->writes); - free_percpu(c->prio_clock[WRITE].rescale_percpu); - 
free_percpu(c->prio_clock[READ].rescale_percpu); + bch_io_clock_exit(&c->io_clock[WRITE]); + bch_io_clock_exit(&c->io_clock[READ]); if (c->wq) destroy_workqueue(c->wq); if (c->bio_split) @@ -998,9 +999,8 @@ static const char *bch_cache_set_alloc(struct cache_sb *sb, !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(c->wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || - !(c->prio_clock[READ].rescale_percpu = alloc_percpu(unsigned)) || - !(c->prio_clock[WRITE].rescale_percpu = alloc_percpu(unsigned)) || - percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) || + bch_io_clock_init(&c->io_clock[READ]) || + bch_io_clock_init(&c->io_clock[WRITE]) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) @@ -1173,6 +1173,9 @@ static const char *run_cache_set(struct cache_set *c) bch_journal_meta(c, &cl); } + bch_prio_timer_start(c, READ); + bch_prio_timer_start(c, WRITE); + err = "error starting btree GC thread"; if (bch_gc_thread_start(c)) goto err; |