author    Kent Overstreet <kent.overstreet@gmail.com>  2016-10-02 19:17:57 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-10-07 12:37:16 -0800
commit    6fd7c5b18b42f4adeea8bad4f5fba741d65609f2 (patch)
tree      b3d9d7cceec0bc7485d0bf39d9a94feb9dc2dc4e
parent    0722e2548e971455ac51d5b470ccc0d7e2ca62c1 (diff)
bcache: Make allocation code more intelligent
- rework bch_bucket_alloc_set so that it doesn't fail unless it can't
  allocate from any devices
- split out the policy crap - "foreground writes try to allocate from
  tier 0 first" - into a separate wrapper function
- make btree allocations also just prefer tier 0, instead of _only_
  allocating from tier 0
-rw-r--r--  drivers/md/bcache/alloc.c        166
-rw-r--r--  drivers/md/bcache/io.c             2
-rw-r--r--  include/trace/events/bcache.h      8
3 files changed, 83 insertions(+), 93 deletions(-)
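
Before the per-file hunks, here is a minimal userspace sketch of the policy this patch introduces: one routine makes a weighted pass over the unused devices of a single group, and a wrapper tries tier 0 first for foreground writes before falling back to the full device group. Everything here (struct dev, pick_dev, alloc_group, alloc_set, the rand()-based weighting) is an illustrative stand-in, not the patch code; the real implementation is in the drivers/md/bcache/alloc.c hunks below.

/*
 * Simplified, self-contained model of the new allocation policy.
 * Types and helpers are stand-ins for the kernel structures; the real
 * bch_bucket_alloc_group() round-robins with a cursor and a fail_idx
 * marker instead of rescanning the group for each pointer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct dev {
	uint64_t	free_buckets;	/* drives the weighting */
	bool		used;		/* open bucket already points at this device */
};

struct group {
	struct dev	*d;
	unsigned	nr;
};

/* Pick one unused device, weighted by free space; NULL if none qualifies. */
static struct dev *pick_dev(struct group *devs)
{
	uint64_t total = 0, target;
	unsigned i;

	for (i = 0; i < devs->nr; i++)
		if (!devs->d[i].used)
			total += devs->d[i].free_buckets;
	if (!total)
		return NULL;

	target = rand() % total;	/* biased for huge totals; fine for a sketch */
	for (i = 0; i < devs->nr; i++) {
		struct dev *ca = &devs->d[i];

		if (ca->used)
			continue;
		if (target < ca->free_buckets)
			return ca;
		target -= ca->free_buckets;
	}
	return NULL;	/* not reached */
}

/* Add pointers from @devs until the open bucket has nr_replicas of them. */
static int alloc_group(struct group *devs, unsigned nr_replicas, unsigned *nr_ptrs)
{
	while (*nr_ptrs < nr_replicas) {
		struct dev *ca = pick_dev(devs);

		if (!ca)
			return -1;	/* nothing left to allocate from in this group */

		ca->free_buckets--;	/* models bch_bucket_alloc() succeeding */
		ca->used = true;
		(*nr_ptrs)++;
	}
	return 0;
}

/* Policy wrapper: foreground writes prefer tier 0, then fall back to any device. */
static int alloc_set(struct group *cache_all, struct group *tier0, bool foreground,
		     unsigned nr_replicas, unsigned *nr_ptrs)
{
	if (foreground)
		alloc_group(tier0, nr_replicas, nr_ptrs);	/* best effort only */

	return alloc_group(cache_all, nr_replicas, nr_ptrs);
}

int main(void)
{
	/* both groups share the same devices, like cache_tiers[0] and cache_all */
	struct dev devices[2] = {
		{ .free_buckets = 100 },	/* tier 0, e.g. SSD */
		{ .free_buckets = 1000 },	/* tier 1, e.g. HDD */
	};
	struct group tier0	= { .d = devices, .nr = 1 };
	struct group cache_all	= { .d = devices, .nr = 2 };
	unsigned nr_ptrs = 0;

	/* a foreground write wanting 2 replicas gets one from each tier */
	return alloc_set(&cache_all, &tier0, true, 2, &nr_ptrs) ? 1 : 0;
}

The kernel version reaches the same decisions with a round-robin cursor plus a fail_idx marker, which is what lets it distinguish the two failure modes in enum bucket_alloc_ret: no unused device at all (NO_DEVICES, reported as -EROFS by bch_bucket_alloc_set) versus an unused device whose freelist came up empty (FREELIST_EMPTY, which waits on the allocator thread or returns -ENOSPC when no closure is supplied).
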
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 422b6aacd749..6a37bb9c2850 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -920,32 +920,17 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
enum bucket_alloc_ret {
ALLOC_SUCCESS,
- CACHE_SET_FULL, /* -ENOSPC */
+ NO_DEVICES, /* -EROFS */
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
-static struct cache *bch_next_cache(struct cache_set *c,
- enum alloc_reserve reserve,
- struct cache_group *devs,
- long *cache_used)
+static void recalc_alloc_group_weights(struct cache_set *c,
+ struct cache_group *devs)
{
struct cache *ca;
- unsigned i, weight;
u64 available_buckets = 0;
+ unsigned i;
- spin_lock(&devs->lock);
-
- if (devs->nr_devices == 0)
- goto err;
-
- if (devs->nr_devices == 1) {
- ca = devs->d[0].dev;
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- goto err;
- goto out;
- }
-
- /* recalculate weightings: XXX don't do this on every call */
for (i = 0; i < devs->nr_devices; i++) {
ca = devs->d[i].dev;
@@ -965,35 +950,9 @@ static struct cache *bch_next_cache(struct cache_set *c,
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
}
-
- for (i = 0; i < devs->nr_devices; i++)
- if (!test_bit(devs->d[i].dev->sb.nr_this_dev, cache_used))
- goto available;
-
- /* no unused devices: */
- goto err;
-available:
- i = devs->cur_device;
- do {
- weight = devs->d[i].weight;
- ca = devs->d[i].dev;
- i++;
- i %= devs->nr_devices;
- } while (test_bit(ca->sb.nr_this_dev, cache_used) ||
- get_random_int() > weight);
- devs->cur_device = i;
-out:
- spin_unlock(&devs->lock);
- return ca;
-err:
- spin_unlock(&devs->lock);
- return NULL;
}
-/*
- * XXX: change the alloc_group to explicitly round robin across devices
- */
-static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
struct open_bucket *ob,
enum alloc_reserve reserve,
unsigned nr_replicas,
@@ -1001,38 +960,52 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
long *caches_used)
{
enum bucket_alloc_ret ret;
+ unsigned fail_idx = -1, i;
- BUG_ON(nr_replicas > BCH_REPLICAS_MAX);
-
- if (!devs->nr_devices)
- return CACHE_SET_FULL;
+ if (ob->nr_ptrs >= nr_replicas)
+ return ALLOC_SUCCESS;
rcu_read_lock();
+ spin_lock(&devs->lock);
+
+ for (i = 0; i < devs->nr_devices; i++)
+ if (!test_bit(devs->d[i].dev->sb.nr_this_dev, caches_used))
+ goto available;
+
+ /* no unused devices: */
+ ret = NO_DEVICES;
+ goto err;
- /* sort by free space/prio of oldest data in caches */
+available:
+ recalc_alloc_group_weights(c, devs);
+
+ i = devs->cur_device;
while (ob->nr_ptrs < nr_replicas) {
- struct cache_group *d;
struct cache *ca;
- size_t r;
+ u64 bucket;
- /* first ptr goes to the specified tier, the rest to any */
-
- d = (!ob->nr_ptrs && devs == &c->cache_all &&
- c->cache_tiers[0].nr_devices)
- ? &c->cache_tiers[0]
- : devs;
+ i++;
+ i %= devs->nr_devices;
- ca = bch_next_cache(c, reserve, d, caches_used);
- if (!ca) {
- ret = CACHE_SET_FULL;
+ ret = FREELIST_EMPTY;
+ if (i == fail_idx)
goto err;
- }
- r = bch_bucket_alloc(ca, reserve);
- if (!r) {
- ret = FREELIST_EMPTY;
- goto err;
+ ca = devs->d[i].dev;
+
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ continue;
+
+ if (fail_idx == -1 &&
+ get_random_int() > devs->d[i].weight)
+ continue;
+
+ bucket = bch_bucket_alloc(ca, reserve);
+ if (!bucket) {
+ if (fail_idx == -1)
+ fail_idx = i;
+ continue;
}
/*
@@ -1044,46 +1017,69 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
ob->nr_ptrs * sizeof(ob->ptrs[0]));
ob->nr_ptrs++;
ob->ptrs[0] = (struct bch_extent_ptr) {
- .gen = ca->bucket_gens[r],
- .offset = bucket_to_sector(ca, r),
+ .gen = ca->bucket_gens[bucket],
+ .offset = bucket_to_sector(ca, bucket),
.dev = ca->sb.nr_this_dev,
};
__set_bit(ca->sb.nr_this_dev, caches_used);
+ devs->cur_device = i;
}
- rcu_read_unlock();
- return ALLOC_SUCCESS;
+ ret = ALLOC_SUCCESS;
err:
+ spin_unlock(&devs->lock);
rcu_read_unlock();
return ret;
}
-static int bch_bucket_alloc_set(struct cache_set *c, struct open_bucket *ob,
- enum alloc_reserve reserve,
- unsigned nr_replicas,
- struct cache_group *devs, long *caches_used,
- struct closure *cl)
+static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ long *caches_used)
+{
+ /*
+ * this should implement policy - for a given type of allocation, decide
+ * which devices to allocate from:
+ *
+ * XXX: switch off wp->type and do something more intelligent here
+ */
+
+ /* foreground writes: prefer tier 0: */
+ if (wp->group == &c->cache_all)
+ bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas,
+ &c->cache_tiers[0], caches_used);
+
+ return bch_bucket_alloc_group(c, ob, wp->reserve, nr_replicas,
+ wp->group, caches_used);
+}
+
+static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob, unsigned nr_replicas,
+ long *caches_used, struct closure *cl)
{
bool waiting = false;
while (1) {
- switch (__bch_bucket_alloc_set(c, ob, reserve, nr_replicas,
- devs, caches_used)) {
+ switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ caches_used)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
return 0;
- case CACHE_SET_FULL:
- trace_bcache_cache_set_full(c, reserve, cl);
-
+ case NO_DEVICES:
if (waiting)
closure_wake_up(&c->freelist_wait);
- return -ENOSPC;
+ return -EROFS;
case FREELIST_EMPTY:
+ if (!cl || waiting)
+ trace_bcache_freelist_empty_fail(c,
+ wp->reserve, cl);
+
if (!cl)
return -ENOSPC;
@@ -1311,8 +1307,8 @@ static int open_bucket_add_buckets(struct cache_set *c,
}
}
- return bch_bucket_alloc_set(c, ob, wp->reserve, nr_replicas,
- wp->group, caches_used, cl);
+ return bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ caches_used, cl);
}
/*
@@ -1819,7 +1815,7 @@ void bch_open_buckets_init(struct cache_set *c)
c->migration_write_point.group = &c->cache_all;
c->migration_write_point.reserve = RESERVE_NONE;
- c->btree_write_point.group = &c->cache_tiers[0];
+ c->btree_write_point.group = &c->cache_all;
c->btree_write_point.reserve = RESERVE_BTREE;
c->pd_controllers_update_seconds = 5;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 3c6402c477f8..c1ca30f7ca2a 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -575,7 +575,7 @@ static void __bch_write(struct closure *cl)
if (unlikely(IS_ERR(b))) {
if (unlikely(PTR_ERR(b) != -EAGAIN)) {
- ret = -EROFS;
+ ret = PTR_ERR(b);
goto err;
}
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 3a710a91125f..f4153a5c962d 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -962,13 +962,7 @@ DECLARE_EVENT_CLASS(cache_set_bucket_alloc,
__entry->cl)
);
-DEFINE_EVENT(cache_set_bucket_alloc, bcache_buckets_unavailable_fail,
- TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
- struct closure *cl),
- TP_ARGS(c, reserve, cl)
-);
-
-DEFINE_EVENT(cache_set_bucket_alloc, bcache_cache_set_full,
+DEFINE_EVENT(cache_set_bucket_alloc, bcache_freelist_empty_fail,
TP_PROTO(struct cache_set *c, enum alloc_reserve reserve,
struct closure *cl),
TP_ARGS(c, reserve, cl)