author    Kent Overstreet <kent.overstreet@gmail.com>  2016-10-02 19:17:57 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-10-07 12:37:16 -0800
commit    6fd7c5b18b42f4adeea8bad4f5fba741d65609f2 (patch)
tree      b3d9d7cceec0bc7485d0bf39d9a94feb9dc2dc4e
parent    0722e2548e971455ac51d5b470ccc0d7e2ca62c1 (diff)
bcache: Make allocation code more intelligent
- rework bch_bucket_alloc_set so that it doesn't fail unless it can't
  allocate from any devices
- split out the policy crap - "foreground writes try to allocate from
  tier 0 first" - into a separate wrapper function
- make btree allocations also just prefer tier 0, instead of _only_
  allocating from tier 0
-rw-r--r--  drivers/md/bcache/alloc.c        166
-rw-r--r--  drivers/md/bcache/io.c             2
-rw-r--r--  include/trace/events/bcache.h      8
3 files changed, 83 insertions(+), 93 deletions(-)
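
Before the per-file hunks, here is a minimal userspace sketch of the policy this patch introduces: one routine makes a weighted pass over the unused devices of a single group, and a wrapper tries tier 0 first for foreground writes before falling back to the full device group. Everything here (struct dev, pick_dev, alloc_group, alloc_set, the rand()-based weighting) is an illustrative stand-in, not the patch code; the real implementation is in the drivers/md/bcache/alloc.c hunks below.

/*
 * Simplified, self-contained model of the new allocation policy.
 * Types and helpers are stand-ins for the kernel structures; the real
 * bch_bucket_alloc_group() round-robins with a cursor and a fail_idx
 * marker instead of rescanning the group for each pointer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct dev {
	uint64_t	free_buckets;	/* drives the weighting */
	bool		used;		/* open bucket already points at this device */
};

struct group {
	struct dev	*d;
	unsigned	nr;
};

/* Pick one unused device, weighted by free space; NULL if none qualifies. */
static struct dev *pick_dev(struct group *devs)
{
	uint64_t total = 0, target;
	unsigned i;

	for (i = 0; i < devs->nr; i++)
		if (!devs->d[i].used)
			total += devs->d[i].free_buckets;
	if (!total)
		return NULL;

	target = rand() % total;	/* biased for huge totals; fine for a sketch */
	for (i = 0; i < devs->nr; i++) {
		struct dev *ca = &devs->d[i];

		if (ca->used)
			continue;
		if (target < ca->free_buckets)
			return ca;
		target -= ca->free_buckets;
	}
	return NULL;	/* not reached */
}

/* Add pointers from @devs until the open bucket has nr_replicas of them. */
static int alloc_group(struct group *devs, unsigned nr_replicas, unsigned *nr_ptrs)
{
	while (*nr_ptrs < nr_replicas) {
		struct dev *ca = pick_dev(devs);

		if (!ca)
			return -1;	/* nothing left to allocate from in this group */

		ca->free_buckets--;	/* models bch_bucket_alloc() succeeding */
		ca->used = true;
		(*nr_ptrs)++;
	}
	return 0;
}

/* Policy wrapper: foreground writes prefer tier 0, then fall back to any device. */
static int alloc_set(struct group *cache_all, struct group *tier0, bool foreground,
		     unsigned nr_replicas, unsigned *nr_ptrs)
{
	if (foreground)
		alloc_group(tier0, nr_replicas, nr_ptrs);	/* best effort only */

	return alloc_group(cache_all, nr_replicas, nr_ptrs);
}

int main(void)
{
	/* both groups share the same devices, like cache_tiers[0] and cache_all */
	struct dev devices[2] = {
		{ .free_buckets = 100 },	/* tier 0, e.g. SSD */
		{ .free_buckets = 1000 },	/* tier 1, e.g. HDD */
	};
	struct group tier0	= { .d = devices, .nr = 1 };
	struct group cache_all	= { .d = devices, .nr = 2 };
	unsigned nr_ptrs = 0;

	/* a foreground write wanting 2 replicas gets one from each tier */
	return alloc_set(&cache_all, &tier0, true, 2, &nr_ptrs) ? 1 : 0;
}

The kernel version reaches the same decisions with a round-robin cursor plus a fail_idx marker, which is what lets it distinguish the two failure modes in enum bucket_alloc_ret: no unused device at all (NO_DEVICES, reported as -EROFS by bch_bucket_alloc_set) versus an unused device whose freelist came up empty (FREELIST_EMPTY, which waits on the allocator thread or returns -ENOSPC when no closure is supplied).
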
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 422b6aacd749..6a37bb9c2850 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -920,32 +920,17 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
enum bucket_alloc_ret {
ALLOC_SUCCESS,
- CACHE_SET_FULL, /* -ENOSPC */
+ NO_DEVICES, /* -EROFS */
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
-static struct cache *bch_next_cache(struct cache_set *c,
- enum alloc_reserve reserve,
- struct cache_group *devs,
- long *cache_used)
+static void recalc_alloc_group_weights(struct cache_set *c,
+ struct cache_group *devs)
{
struct cache *ca;
- unsigned i, weight;
u64 available_buckets = 0;
+ unsigned i;
- spin_lock(&devs->lock);
-
- if (devs->nr_devices == 0)
- goto err;
-
- if (devs->nr_devices == 1) {
- ca = devs->d[0].dev;
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- goto err;
- goto out;
- }
-
- /* recalculate weightings: XXX don't do this on every call */
for (i = 0; i < devs->nr_devices; i++) {
ca = devs->d[i].dev;
@@ -965,35 +950,9 @@ static struct cache *bch_next_cache(struct cache_set *c,
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
}
-
- for (i = 0; i < devs->nr_devices; i++)
- if (!test_bit(devs->d[i].dev->sb.nr_this_dev, cache_used))
- goto available;
-
- /* no unused devices: */
- goto err;
-available:
- i = devs->cur_device;
- do {
- weight = devs->d[i].weight;
- ca = devs->d[i].dev;
- i++;
- i %= devs->nr_devices;
- } while (test_bit(ca->sb.nr_this_dev, cache_used) ||
- get_random_int() > weight);
- devs->cur_device = i;
-out:
- spin_unlock(&devs->lock);
- return ca;
-err:
- spin_unlock(&devs->lock);
- return NULL;
}
-/*
- * XXX: change the alloc_group to explicitly round robin across devices
- */
-static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
struct open_bucket *ob,
enum alloc_reserve reserve,
unsigned nr_replicas,
@@ -1001,38 +960,52 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
long *caches_used)
{
enum bucket_alloc_ret ret;
+ unsigned fail_idx = -1, i;
- BUG_ON(nr_replicas > BCH_REPLICAS_MAX);
-
- if (!devs->nr_devices)
- return CACHE_SET_FULL;
+ if (ob->nr_ptrs >= nr_replicas)
+ return ALLOC_SUCCESS;
rcu_read_lock();
+ spin_lock(&devs->lock);
+
+ for (i = 0; i < devs->nr_devices; i++)
+ if (!test_bit(devs->d[i].dev->sb.nr_this_dev, caches_used))
+ goto available;
+
+ /* no unused devices: */
+ ret = NO_DEVICES;
+ goto err;
- /* sort by free space/prio of oldest data in caches */
+available:
+ recalc_alloc_group_weights(c, devs);
+
+ i = devs->cur_device;
while (ob->nr_ptrs < nr_replicas) {
- struct cache_group *d;
struct cache *ca;
- size_t r;
+ u64 bucket;
- /* first ptr goes to the specified tier, the rest to any */
-
- d = (!ob->nr_ptrs && devs == &c->cache_all &&
- c->cache_tiers[0].nr_devices)
- ? &c->cache_tiers[0]
- : devs;
+ i++;
+ i %= devs->nr_devices;
- ca = bch_next_cache(c, reserve, d, caches_used);
- if (!ca) {
- ret = CACHE_SET_FULL;
+ ret = FREELIST_EMPTY;
+ if (i == fail_idx)
goto err;
- }
- r = bch_bucket_alloc(ca, reserve);
- if (!r) {
- ret = FREELIST_EMPTY;
- goto err;
+ ca = devs->d[i].dev;
+
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ continue;
+
+ if (fail_idx == -1 &&
+ get_random_int() > devs->d[i].weight)
+ continue;
+
+ bucket = bch_bucket_alloc(ca, reserve);
+ if (!bucket) {
+ if (fail_idx == -1)
+ fail_idx = i;
+ continue;
}
/*
@@ -1044,46 +1017,69 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
ob->nr_ptrs * sizeof(ob->ptrs[0]));
ob->nr_ptrs++;
ob->ptrs[0] = (struct bch_extent_ptr) {
- .gen = ca->bucket_gens[r],
- .offset = bucket_to_sector(ca, r),
+ .gen = ca->bucket_gens[bucket],
+ .offset = bucket_to_sector(ca, bucket),
.dev = ca->sb.nr_this_dev,
};
__set_bit(ca->sb.nr_this_dev, caches_used);
+ devs->cur_device = i;
}
- rcu_read_unlock();
- return ALLOC_SUCCESS;
+ ret = ALLOC_SUCCESS;
err:
+ spin_unlock(&devs->lock);
rcu_read_unlock();
return ret;
}
-static int bch_bucket_alloc_set(struct cache_set *c, struct open_bucket *ob,
- enum alloc_reserve reserve,
- unsigned nr_replicas,
- struct cache_group *devs, long *caches_used,
- struct closure *cl)
+static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ long *caches_used)
+{
+ /*
+ * this should implement policy - for a given type of allocation, decide
+ * which devices to allocate from:
+ *
+ * XXX: switch off wp->type and do something more intelligent here
+ */
+
+ /* foreground writes: prefer tier 0: */
+ if (wp->group == &c->cache_all)
+ bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas,
+ &c->cache_tiers[0], caches_used);
+
+ return bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas,
+ wp->group, caches_used);
+}
+
+static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob, unsigned nr_replicas,
+ long *caches_used, struct closure *cl)
{
bool waiting = false;
while (1) {
- switch (__bch_bucket_alloc_set(c, ob, reserve, nr_replicas,
- devs, caches_used)) {
+ switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ caches_used)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
return 0;
- case CACHE_SET_FULL:
- trace_bcache_cache_set_full(c, reserve, cl);
-
+ case NO_DEVICES:
if (waiting)
closure_wake_up(&c->freelist_wait);
- return -ENOSPC;
+ return -EROFS;
case FREELIST_EMPTY:
+ if (!cl || waiting)
+ trace_bcache_freelist_empty_fail(c,
+ wp->reserve, cl);
+
if (!cl)
return -ENOSPC;
@@ -1311,8 +1307,8 @@ static int open_bucket_add_buckets(struct cache_set *c,
}
}
- return bch_bucket_alloc_set(c, ob, wp->reserve, nr_replicas,
- wp->group, caches_used, cl);
+ return bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ caches_used, cl);
}
/*
@@ -1819,7 +1815,7 @@ void bch_open_buckets_init(struct cache_set *c)
c->migration_write_point.group = &c->cache_all;
c->migration_write_point.reserve = RESERVE_NONE;
- c->btree_write_point.group = &c->cache_tiers[0];
+ c->btree_write_point.group = &c->cache_all;
c->btree_write_point.reserve = RESERVE_BTREE;
c->pd_controllers_update_seconds = 5;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 3c6402c477f8..c1ca30f7ca2a 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -575,7 +575,7 @@ static void __bch_write(struct closure *cl)
if (unlikely(IS_ERR(b))) {
if (unlikely(PTR_ERR(b) != -EAGAIN)) {
- ret = -EROFS;
+ ret = PTR_ERR(b);
goto err;
}
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 3a710a91125f..f4153a5c962d 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -962,13 +962,7 @@ DECLARE_EVENT_CLASS(cache_set_bucket_alloc,
__entry->cl)
);
-DEFINE_EVENT(cache_set_bucket_alloc, bcache_buckets_unavailable_fail,
- TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
- struct closure *cl),
- TP_ARGS(c, reserve, cl)
-);
-
-DEFINE_EVENT(cache_set_bucket_alloc, bcache_cache_set_full,
+DEFINE_EVENT(cache_set_bucket_alloc, bcache_freelist_empty_fail,
TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
struct closure *cl),
TP_ARGS(c, reserve, cl)