author    Kent Overstreet <kent.overstreet@gmail.com>    2015-04-11 00:48:41 -0700
committer Kent Overstreet <kent.overstreet@gmail.com>    2016-10-07 12:33:52 -0800
commit    81c307487a9a133a61cb20f44860a151dd78dbee (patch)
tree      8a11101a2158c2112a917c933f17b7f3ea8ccd17
parent    346195d01ddd18f99e18e7502a849ed88c0534b9 (diff)
bcache: General purpose IO clocks
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  drivers/md/bcache/Makefile       |  2
-rw-r--r--  drivers/md/bcache/alloc.c        | 38
-rw-r--r--  drivers/md/bcache/alloc.h        | 25
-rw-r--r--  drivers/md/bcache/alloc_types.h  | 29
-rw-r--r--  drivers/md/bcache/bcache.h       | 11
-rw-r--r--  drivers/md/bcache/clock.c        | 74
-rw-r--r--  drivers/md/bcache/clock.h        | 10
-rw-r--r--  drivers/md/bcache/clock_types.h  | 32
-rw-r--r--  drivers/md/bcache/io.c           |  1
-rw-r--r--  drivers/md/bcache/request.c      |  2
-rw-r--r--  drivers/md/bcache/super.c        | 13
11 files changed, 186 insertions, 51 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 453bfd1cb466..02ef2612777e 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o bkey.o bkey_methods.o blockdev.o bset.o\
- btree.o buckets.o closure.o debug.o extents.o gc.o inode.o io.o\
+ btree.o buckets.o clock.o closure.o debug.o extents.o gc.o inode.o io.o\
journal.o keybuf.o keylist.o migrate.o move.o movinggc.o notify.o\
request.o six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d50ca2205ec0..b79745991532 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -57,6 +57,7 @@
#include "alloc.h"
#include "btree.h"
#include "buckets.h"
+#include "clock.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
@@ -502,18 +503,12 @@ static void bch_rescale_prios(struct cache_set *c, int rw)
}
}
-void bch_increment_clock_slowpath(struct cache_set *c, int rw)
+static void bch_inc_clock_hand(struct io_timer *timer)
{
- struct prio_clock *clock = &c->prio_clock[rw];
- long next = c->capacity >> 10;
- long old, v = atomic_long_read(&clock->rescale);
-
- do {
- old = v;
- if (old >= 0)
- return;
- } while ((v = atomic_long_cmpxchg(&clock->rescale,
- old, old + next)) != old);
+ struct prio_clock *clock = container_of(timer,
+ struct prio_clock, rescale);
+ struct cache_set *c = container_of(clock,
+ struct cache_set, prio_clock[clock->rw]);
mutex_lock(&c->bucket_lock);
@@ -521,9 +516,28 @@ void bch_increment_clock_slowpath(struct cache_set *c, int rw)
/* if clock cannot be advanced more, rescale prio */
if (clock->hand == (u16) (clock->min_prio - 1))
- bch_rescale_prios(c, rw);
+ bch_rescale_prios(c, clock->rw);
mutex_unlock(&c->bucket_lock);
+
+ /*
+ * we only advance the clock hand again once another 0.1% of the
+ * cache_set's capacity has been read or written to
+ */
+ timer->expire += c->capacity >> 10;
+
+ bch_io_timer_add(&c->io_clock[clock->rw], timer);
+}
+
+void bch_prio_timer_start(struct cache_set *c, int rw)
+{
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct io_timer *timer = &clock->rescale;
+
+ clock->rw = rw;
+ timer->fn = bch_inc_clock_hand;
+ timer->expire = c->capacity >> 10;
+ bch_io_timer_add(&c->io_clock[rw], timer);
}
/*
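
For illustration: the new bch_inc_clock_hand() is a self-rearming io_timer. It advances the prio hand, then pushes its own expire forward by c->capacity >> 10 (roughly 0.1% of the cache's capacity in sectors) and re-adds itself to the IO clock, so the hand ticks once per that much IO rather than being driven by a dedicated atomic counter. The standalone userspace sketch below only models that rearming loop; model_timer, inc_clock_hand and the 1 GiB capacity are made-up example names, not bcache API:

#include <stdio.h>

/* Illustrative stand-ins, not bcache types */
struct model_timer {
	void (*fn)(struct model_timer *);
	unsigned long expire;	/* in sectors of IO */
};

static unsigned long capacity = 1 << 21;	/* e.g. a 1 GiB cache in 512-byte sectors */
static unsigned long io_clock_now;		/* sectors of IO done so far */
static unsigned hand;				/* the prio clock hand */
static struct model_timer rescale_timer;

static void inc_clock_hand(struct model_timer *timer)
{
	hand++;					/* advance "now" in IO time */
	/* re-arm: fire again after another ~0.1% of capacity worth of IO */
	timer->expire += capacity >> 10;
}

int main(void)
{
	rescale_timer.fn = inc_clock_hand;
	rescale_timer.expire = capacity >> 10;

	/* simulate 5000 IOs of 1024 sectors each */
	for (int i = 0; i < 5000; i++) {
		io_clock_now += 1024;
		while (io_clock_now >= rescale_timer.expire)
			rescale_timer.fn(&rescale_timer);
	}
	printf("did %lu sectors of IO, hand advanced %u times\n",
	       io_clock_now, hand);
	return 0;
}
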
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index b962c5745d74..b2fc9d504561 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -15,30 +15,7 @@ void bch_cache_group_add_cache(struct cache_group *, struct cache *);
int bch_prio_read(struct cache *);
void bch_recalc_min_prio(struct cache *, int);
-void bch_increment_clock_slowpath(struct cache_set *, int);
-
-static inline void bch_increment_clock(struct cache_set *c,
- unsigned sectors, int rw)
-{
- struct prio_clock *clock = &c->prio_clock[rw];
-
- /* Buffer up one megabyte worth of IO in the percpu counter */
- preempt_disable();
- if (this_cpu_add_return(*clock->rescale_percpu, sectors) < 2048) {
- preempt_enable();
- return;
- }
-
- sectors = this_cpu_xchg(*clock->rescale_percpu, 0);
- preempt_enable();
-
- /*
- * we only increment when 0.1% of the cache_set has been read
- * or written too, this determines if it's time
- */
- if (atomic_long_sub_return(sectors, &clock->rescale) < 0)
- bch_increment_clock_slowpath(c, rw);
-}
+void bch_prio_timer_start(struct cache_set *, int);
void __bch_bucket_free(struct cache *, struct bucket *);
void bch_bucket_free(struct cache_set *, struct bkey_i *);
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h
index ae852620b37a..47f7a6585c8a 100644
--- a/drivers/md/bcache/alloc_types.h
+++ b/drivers/md/bcache/alloc_types.h
@@ -1,6 +1,35 @@
#ifndef _BCACHE_ALLOC_TYPES_H
#define _BCACHE_ALLOC_TYPES_H
+#include "clock_types.h"
+
+/*
+ * There are two of these clocks, one for reads and one for writes:
+ *
+ * All fields protected by bucket_lock
+ */
+struct prio_clock {
+ /*
+ * "now" in (read/write) IO time - incremented whenever we do X amount
+ * of reads or writes.
+ *
+ * Goes with the bucket read/write prios: when we read or write to a
+ * bucket we reset the bucket's prio to the current hand; thus hand -
+ * prio = time since bucket was last read/written.
+ *
+ * The units are some amount (bytes/sectors) of data read/written, and
+ * the units can change on the fly if we need to rescale to fit
+ * everything in a u16 - your only guarantee is that the units are
+ * consistent.
+ */
+ u16 hand;
+ u16 min_prio;
+
+ int rw;
+
+ struct io_timer rescale;
+};
+
/* There is one reserve for each type of btree, one for prios and gens
* and one for moving GC */
enum alloc_reserve {
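
A side note on the comment above: because hand and prio are both u16, the difference hand - prio stays meaningful across wrap-around, which is why the hand only has to stop (and trigger bch_rescale_prios()) when it is about to collide with the oldest prio still in use. A small standalone sketch of that arithmetic, purely illustrative and not bcache code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t hand = 10;		/* current read/write "time"; has wrapped past 65535 */
	uint16_t prio = 65530;		/* bucket last touched shortly before the wrap */
	uint16_t min_prio = 65530;	/* oldest prio still in use */

	/* age still comes out right across the u16 wrap: 16 ticks */
	printf("bucket age: %u ticks\n", (unsigned)(uint16_t)(hand - prio));

	/*
	 * The hand may keep advancing until it is about to run into the
	 * oldest prio from behind; at that point prios must be rescaled
	 * (bch_rescale_prios()) so that hand - prio stays unambiguous.
	 */
	unsigned steps = 0;
	while (hand != (uint16_t)(min_prio - 1)) {
		hand++;
		steps++;
	}
	printf("rescale needed after %u more ticks\n", steps);
	return 0;
}
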
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6b6a2887bc8c..b7791384bdcf 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -212,6 +212,7 @@
#include "bkey_methods.h"
#include "blockdev_types.h"
#include "buckets_types.h"
+#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "keybuf_types.h"
@@ -385,14 +386,6 @@ enum {
CACHE_SET_GC_FAILURE,
};
-struct prio_clock {
- /* All fields protected by bucket_lock */
- u16 hand;
- u16 min_prio;
- atomic_long_t rescale;
- unsigned __percpu *rescale_percpu;
-};
-
struct cache_member_rcu {
struct rcu_head rcu;
unsigned nr_in_set;
@@ -509,6 +502,8 @@ struct cache_set {
*/
struct prio_clock prio_clock[2];
+ struct io_clock io_clock[2];
+
/* SECTOR ALLOCATOR */
struct list_head open_buckets_open;
struct list_head open_buckets_free;
diff --git a/drivers/md/bcache/clock.c b/drivers/md/bcache/clock.c
new file mode 100644
index 000000000000..bfc068f278db
--- /dev/null
+++ b/drivers/md/bcache/clock.c
@@ -0,0 +1,74 @@
+#include "bcache.h"
+#include "clock.h"
+
+static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r)
+{
+ return time_after(l->expire, r->expire);
+}
+
+void bch_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+ spin_lock(&clock->timer_lock);
+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
+ spin_unlock(&clock->timer_lock);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+ unsigned long now)
+{
+ struct io_timer *ret = NULL;
+
+ spin_lock(&clock->timer_lock);
+
+ if (clock->timers.used &&
+ time_after_eq(now, clock->timers.data[0]->expire))
+ heap_pop(&clock->timers, ret, io_timer_cmp);
+
+ spin_unlock(&clock->timer_lock);
+
+ return ret;
+}
+
+void bch_increment_clock(struct cache_set *c, unsigned sectors, int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+ struct io_timer *timer;
+ unsigned long now;
+
+ /* Buffer up to IO_CLOCK_PCPU_SECTORS worth of IO in the percpu counter */
+ preempt_disable();
+
+ if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+ IO_CLOCK_PCPU_SECTORS)) {
+ preempt_enable();
+ return;
+ }
+
+ sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
+ preempt_enable();
+ now = atomic_long_add_return(sectors, &clock->now);
+
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
+
+void bch_io_clock_exit(struct io_clock *clock)
+{
+ free_heap(&clock->timers);
+ free_percpu(clock->pcpu_buf);
+}
+
+int bch_io_clock_init(struct io_clock *clock)
+{
+ atomic_long_set(&clock->now, 0);
+ spin_lock_init(&clock->timer_lock);
+
+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+ if (!clock->pcpu_buf)
+ return -ENOMEM;
+
+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+ return -ENOMEM;
+
+ return 0;
+}
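
To make the flow above concrete: bch_increment_clock() batches sectors in a percpu counter, and only when a CPU's local buffer crosses IO_CLOCK_PCPU_SECTORS does it flush into the shared atomic now and fire any timers whose expire has passed. The standalone model below follows that shape for a single CPU, using a flat array scan in place of the kernel's heap helpers; all names here (model_timer, increment_clock, PCPU_BATCH) are illustrative stand-ins:

#include <stdio.h>

#define PCPU_BATCH	128	/* stand-in for IO_CLOCK_PCPU_SECTORS */
#define MAX_TIMERS	8

struct model_timer {
	void (*fn)(struct model_timer *);
	unsigned long expire;
	int armed;
};

static unsigned long clock_now;		/* stand-in for io_clock.now */
static unsigned pcpu_buf;		/* one "CPU"'s local sector count */
static struct model_timer timers[MAX_TIMERS];

static void fire_expired(unsigned long now)
{
	for (int i = 0; i < MAX_TIMERS; i++)
		if (timers[i].armed && now >= timers[i].expire) {
			timers[i].armed = 0;
			timers[i].fn(&timers[i]);
		}
}

/* model of bch_increment_clock(): batch locally, flush, then run timers */
static void increment_clock(unsigned sectors)
{
	pcpu_buf += sectors;
	if (pcpu_buf < PCPU_BATCH)
		return;		/* still buffering, shared clock untouched */

	clock_now += pcpu_buf;
	pcpu_buf = 0;
	fire_expired(clock_now);
}

static void hello(struct model_timer *t)
{
	printf("timer fired at io time %lu (wanted %lu)\n", clock_now, t->expire);
}

int main(void)
{
	timers[0] = (struct model_timer){ .fn = hello, .expire = 1000, .armed = 1 };

	for (int i = 0; i < 200; i++)
		increment_clock(8);	/* 8-sector (4 KiB) writes */
	return 0;
}
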
diff --git a/drivers/md/bcache/clock.h b/drivers/md/bcache/clock.h
new file mode 100644
index 000000000000..37d5f20174a5
--- /dev/null
+++ b/drivers/md/bcache/clock.h
@@ -0,0 +1,10 @@
+#ifndef _BCACHE_CLOCK_H
+#define _BCACHE_CLOCK_H
+
+void bch_io_timer_add(struct io_clock *, struct io_timer *);
+void bch_increment_clock(struct cache_set *, unsigned, int);
+
+void bch_io_clock_exit(struct io_clock *);
+int bch_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHE_CLOCK_H */
diff --git a/drivers/md/bcache/clock_types.h b/drivers/md/bcache/clock_types.h
new file mode 100644
index 000000000000..346466a9f987
--- /dev/null
+++ b/drivers/md/bcache/clock_types.h
@@ -0,0 +1,32 @@
+#ifndef _BCACHE_CLOCK_TYPES_H
+#define _BCACHE_CLOCK_TYPES_H
+
+#define NR_IO_TIMERS 8
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+ io_timer_fn fn;
+ unsigned long expire;
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS 128
+
+struct io_clock {
+ atomic_long_t now;
+ u16 __percpu *pcpu_buf;
+
+ spinlock_t timer_lock;
+ DECLARE_HEAP(struct io_timer *, timers);
+};
+
+#endif /* _BCACHE_CLOCK_TYPES_H */
+
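
On the "only approximate" note above: each CPU can be holding up to IO_CLOCK_PCPU_SECTORS - 1 sectors that have not been flushed into now yet, so a timer may fire late by roughly nr_cpus * (IO_CLOCK_PCPU_SECTORS - 1) sectors of IO, plus whatever the flush that finally trips it overshoots by. A back-of-the-envelope check (the 16-CPU machine is just an example):

#include <stdio.h>

#define IO_CLOCK_PCPU_SECTORS	128

int main(void)
{
	unsigned cpus = 16;	/* example machine */
	unsigned long max_lag = (unsigned long)cpus * (IO_CLOCK_PCPU_SECTORS - 1);

	/* 127 sectors/CPU * 16 CPUs = 2032 sectors, about 1 MiB of slack */
	printf("clock can lag by up to %lu sectors (%lu KiB)\n",
	       max_lag, max_lag * 512 / 1024);
	return 0;
}
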
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index af801502d5d6..9b638562a912 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -10,6 +10,7 @@
#include "bset.h"
#include "btree.h"
#include "buckets.h"
+#include "clock.h"
#include "debug.h"
#include "extents.h"
#include "gc.h"
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f35d91fd627f..01febc9f5154 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,8 +25,8 @@
#include "bcache.h"
#include "blockdev.h"
-#include "alloc.h"
#include "btree.h"
+#include "clock.h"
#include "debug.h"
#include "extents.h"
#include "io.h"
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f0429e8bb402..94af71c14745 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,6 +10,7 @@
#include "blockdev.h"
#include "alloc.h"
#include "btree.h"
+#include "clock.h"
#include "debug.h"
#include "gc.h"
#include "inode.h"
@@ -766,8 +767,8 @@ static void cache_set_free(struct closure *cl)
bch_bset_sort_state_free(&c->sort);
percpu_ref_exit(&c->writes);
- free_percpu(c->prio_clock[WRITE].rescale_percpu);
- free_percpu(c->prio_clock[READ].rescale_percpu);
+ bch_io_clock_exit(&c->io_clock[WRITE]);
+ bch_io_clock_exit(&c->io_clock[READ]);
if (c->wq)
destroy_workqueue(c->wq);
if (c->bio_split)
@@ -998,9 +999,8 @@ static const char *bch_cache_set_alloc(struct cache_sb *sb,
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(c->wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
- !(c->prio_clock[READ].rescale_percpu = alloc_percpu(unsigned)) ||
- !(c->prio_clock[WRITE].rescale_percpu = alloc_percpu(unsigned)) ||
- percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
+ bch_io_clock_init(&c->io_clock[READ]) ||
+ bch_io_clock_init(&c->io_clock[WRITE]) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
@@ -1173,6 +1173,9 @@ static const char *run_cache_set(struct cache_set *c)
bch_journal_meta(c, &cl);
}
+ bch_prio_timer_start(c, READ);
+ bch_prio_timer_start(c, WRITE);
+
err = "error starting btree GC thread";
if (bch_gc_thread_start(c))
goto err;