diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-02-20 04:27:56 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-08-28 19:16:16 -0800 |
commit | aae4a1ff28578bbb1346deedfffd5152b5b2a49f (patch) | |
tree | 3a0bcc0e78f65dff676e18952f2cbc5a530c89a2 | |
parent | 8257a6962f0ce690672fc228d220c82126a54f35 (diff) |
rework allocator round robining (branch: bcache-dev-wip)
-rw-r--r-- | fs/bcachefs/alloc.c | 181 | ||||
-rw-r--r-- | fs/bcachefs/bcache.h | 11 | ||||
-rw-r--r-- | fs/bcachefs/journal.c | 12 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/super.h | 45 | ||||
-rw-r--r-- | fs/bcachefs/util.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/util.h | 2 |
7 files changed, 116 insertions, 143 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 3485019c535a..67de0d05535f 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -80,10 +80,10 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache * { unsigned i; - write_seqcount_begin(&grp->lock); + mutex_lock(&grp->lock); for (i = 0; i < grp->nr_devices; i++) - if (rcu_access_pointer(grp->devices[i]) == ca) { + if (grp->devices[i].dev == ca) { grp->nr_devices--; memmove(&grp->devices[i], &grp->devices[i + 1], @@ -91,16 +91,40 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache * break; } - write_seqcount_end(&grp->lock); + grp->next_alloc %= grp->nr_devices; + + mutex_unlock(&grp->lock); } -static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca) +__must_check +static int bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca) { - write_seqcount_begin(&grp->lock); - BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET); + mutex_lock(&grp->lock); + + if (grp->nr_devices == grp->nr_devices_max) { + struct cache_group_entry *new_devices; + size_t new_max = grp->nr_devices_max + ? 
grp->nr_devices_max * 2 + : 8; + + new_devices = krealloc(grp->devices, + new_max * sizeof(grp->devices[0]), + GFP_KERNEL); + if (!new_devices) { + mutex_unlock(&grp->lock); + return -ENOMEM; + } + + grp->devices = new_devices; + } - rcu_assign_pointer(grp->devices[grp->nr_devices++], ca); - write_seqcount_end(&grp->lock); + grp->devices[grp->nr_devices++] = (struct cache_group_entry) { + .dev = ca, + }; + + mutex_unlock(&grp->lock); + + return 0; } /* Ratelimiting/PD controllers */ @@ -124,9 +148,9 @@ static void pd_controllers_update(struct work_struct *work) memset(tier_free, 0, sizeof(tier_free)); memset(tier_dirty, 0, sizeof(tier_dirty)); - rcu_read_lock(); - for (i = CACHE_TIERS - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = CACHE_TIERS - 1; i >= 0; --i) { + mutex_lock(&c->cache_tiers[i].lock); + group_for_each_cache(ca, &c->cache_tiers[i], iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; @@ -159,7 +183,8 @@ static void pd_controllers_update(struct work_struct *work) tier_free[i] += free; tier_dirty[i] += stats.buckets_dirty << bucket_bits; } - rcu_read_unlock(); + mutex_unlock(&c->cache_tiers[i].lock); + } if (tier_size[1]) { u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); @@ -917,64 +942,11 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g) enum bucket_alloc_ret { ALLOC_SUCCESS, - CACHE_SET_FULL, /* -ENOSPC */ + CACHE_SET_FULL, /* No devices to allocate from */ BUCKETS_NOT_AVAILABLE, /* Device full */ FREELIST_EMPTY, /* Allocator thread not keeping up */ }; -static struct cache *bch_next_cache(struct cache_set *c, - enum alloc_reserve reserve, - struct cache_group *devs, - long *cache_used) -{ - struct cache *ca; - size_t bucket_count = 0, rand; - unsigned i; - - /* - * first ptr allocation will always go to the specified tier, - * 2nd and greater can go to any. 
If one tier is significantly larger - * it is likely to go that tier. - */ - - group_for_each_cache_rcu(ca, devs, i) { - if (test_bit(ca->sb.nr_this_dev, cache_used)) - continue; - - bucket_count += buckets_free_cache(ca, reserve); - } - - if (!bucket_count) - return ERR_PTR(-BUCKETS_NOT_AVAILABLE); - - /* - * We create a weighted selection by using the number of free buckets - * in each cache. You can think of this like lining up the caches - * linearly so each as a given range, corresponding to the number of - * free buckets in that cache, and then randomly picking a number - * within that range. - */ - - rand = bch_rand_range(bucket_count); - - group_for_each_cache_rcu(ca, devs, i) { - if (test_bit(ca->sb.nr_this_dev, cache_used)) - continue; - - bucket_count -= buckets_free_cache(ca, reserve); - - if (rand >= bucket_count) - return ca; - } - - /* - * If we fall off the end, it means we raced because of bucket counters - * changing - return NULL so __bch_bucket_alloc_set() knows to retry - */ - - return NULL; -} - /* * XXX: change the alloc_group to explicitly round robin across devices */ @@ -986,48 +958,45 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, long *caches_used) { enum bucket_alloc_ret ret; + struct cache_group_entry *i; + unsigned idx, start_idx; + u64 buckets_free_total = 0, buckets_free_fraction, r; + bool first_loop = true; BUG_ON(nr_replicas > BCH_REPLICAS_MAX); - if (!devs->nr_devices) + mutex_lock(&devs->lock); + + if (!devs->nr_devices) { + mutex_unlock(&devs->lock); return CACHE_SET_FULL; + } - rcu_read_lock(); + for (i = devs->devices; + i < devs->devices + devs->nr_devices; + i++) { + i->buckets_free = buckets_free_cache(i->dev, reserve); + buckets_free_total += i->buckets_free; + } - /* sort by free space/prio of oldest data in caches */ + idx = start_idx = devs->next_alloc; while (ob->nr_ptrs < nr_replicas) { struct cache *ca; - unsigned seq; - size_t r; - /* first ptr goes to the specified tier, the rest to 
any */ - do { - struct cache_group *d; + i = &devs->devices[idx]; + ca = i->dev; - seq = read_seqcount_begin(&devs->lock); + if (test_bit(ca->sb.nr_this_dev, caches_used)) + goto next; - d = (!ob->nr_ptrs && devs == &c->cache_all && - c->cache_tiers[0].nr_devices) - ? &c->cache_tiers[0] - : devs; + buckets_free_fraction = (u64) i->buckets_free * + devs->nr_devices; - ca = devs->nr_devices - ? bch_next_cache(c, reserve, d, caches_used) - : ERR_PTR(-CACHE_SET_FULL); - - /* - * If ca == NULL, we raced because of bucket counters - * changing - */ - } while (read_seqcount_retry(&devs->lock, seq) || !ca); - - if (IS_ERR(ca)) { - ret = -PTR_ERR(ca); - goto err; - } - - __set_bit(ca->sb.nr_this_dev, caches_used); + if (first_loop && + (buckets_free_fraction < buckets_free_total && + buckets_free_fraction < bch_rand_range(buckets_free_total))) + goto next; r = bch_bucket_alloc(ca, reserve); if (!r) { @@ -1035,6 +1004,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, goto err; } + __set_bit(ca->sb.nr_this_dev, caches_used); + /* * open_bucket_add_buckets expects new pointers at the head of * the list: @@ -1048,9 +1019,19 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, .offset = bucket_to_sector(ca, r), .dev = ca->sb.nr_this_dev, }; +next: + idx++; + idx %= devs->nr_devices; + + if (idx == start_idx) { + if (!first_loop) + break; + first_loop = false; + } } - rcu_read_unlock(); + mutex_unlock(&devs->lock); + return ALLOC_SUCCESS; err: rcu_read_unlock(); @@ -1785,7 +1766,10 @@ void bch_open_buckets_init(struct cache_set *c) list_add(&c->open_buckets[i].list, &c->open_buckets_free); } - seqcount_init(&c->cache_all.lock); + mutex_init(&c->cache_all.lock); + + for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) + mutex_init(&c->cache_tiers[i].lock); for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { c->write_points[i].throttle = true; @@ -1793,9 +1777,6 @@ void bch_open_buckets_init(struct cache_set *c) 
c->write_points[i].group = &c->cache_tiers[0]; } - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - seqcount_init(&c->cache_tiers[i].lock); - c->promote_write_point.group = &c->cache_tiers[0]; c->promote_write_point.reserve = RESERVE_NONE; diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h index a6bbd38f4316..37237109299a 100644 --- a/fs/bcachefs/bcache.h +++ b/fs/bcachefs/bcache.h @@ -314,10 +314,17 @@ struct gc_pos { unsigned level; }; +struct cache_group_entry { + struct cache *dev; + u64 buckets_free; +}; + struct cache_group { - seqcount_t lock; + struct mutex lock; unsigned nr_devices; - struct cache __rcu *devices[MAX_CACHES_PER_SET]; + unsigned nr_devices_max; + unsigned next_alloc; + struct cache_group_entry *devices; }; struct cache_member_cpu { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1b49c44f92c4..ad97bb834f8a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1537,6 +1537,7 @@ static void journal_reclaim_work(struct work_struct *work) * Advance last_idx to point to the oldest journal entry containing * btree node updates that have not yet been written out */ + mutex_lock(&c->cache_tiers[0].lock); group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; @@ -1584,6 +1585,7 @@ static void journal_reclaim_work(struct work_struct *work) ja->bucket_seq[bucket_to_flush]); spin_unlock(&j->lock); } + mutex_unlock(&c->cache_tiers[0].lock); if (reclaim_lock_held) mutex_unlock(&j->reclaim_lock); @@ -1634,7 +1636,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) * Determine location of the next journal write: * XXX: sort caches by free journal space */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) { + + /* XXX: use devices from other tiers if insufficient tier 0 devices */ + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; unsigned nr_buckets = 
bch_nr_journal_buckets(ca->disk_sb.sb); @@ -1664,6 +1669,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } + mutex_unlock(&c->cache_tiers[0].lock); rcu_read_unlock(); spin_unlock(&j->lock); @@ -2264,7 +2270,8 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) { + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; ret += scnprintf(buf + ret, PAGE_SIZE - ret, @@ -2276,6 +2283,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } + mutex_unlock(&c->cache_tiers[0].lock); spin_unlock(&j->lock); rcu_read_unlock(); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index beb0587be4ce..1938bec2bbe1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -146,7 +146,8 @@ static int bch_congested_fn(void *data, int bdi_bits) } } else { /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -154,6 +155,7 @@ static int bch_congested_fn(void *data, int bdi_bits) break; } } + mutex_unlock(&c->cache_tiers[0].lock); } rcu_read_unlock(); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 24f9b0ead9c8..8ec9fcffabdf 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -59,40 +59,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, (ca = bch_get_next_cache(c, &(iter))); \ percpu_ref_put(&ca->ref), (iter)++) -static inline struct cache *cache_group_next_rcu(struct cache_group *devs, - unsigned *iter) -{ - struct cache 
*ret = NULL; - - while (*iter < devs->nr_devices && - !(ret = rcu_dereference(devs->devices[*iter]))) - (*iter)++; - - return ret; -} - -#define group_for_each_cache_rcu(ca, devs, iter) \ - for ((iter) = 0; \ - ((ca) = cache_group_next_rcu((devs), &(iter))); \ - (iter)++) - -static inline struct cache *cache_group_next(struct cache_group *devs, - unsigned *iter) -{ - struct cache *ret; - - rcu_read_lock(); - if ((ret = cache_group_next_rcu(devs, iter))) - percpu_ref_get(&ret->ref); - rcu_read_unlock(); - - return ret; -} - #define group_for_each_cache(ca, devs, iter) \ - for ((iter) = 0; \ - (ca = cache_group_next(devs, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) + for (({ lockdep_assert_held(&(devs)->lock); (iter) = 0; }); \ + (iter) < (devs)->nr_devices && \ + ((ca) = (devs)->devices[(iter)].dev, true); \ + (iter)++) void bch_check_mark_super_slowpath(struct cache_set *, const struct bkey_i *, bool); @@ -132,6 +103,7 @@ static inline bool bch_cache_may_remove(struct cache *ca) { struct cache_set *c = ca->set; struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + bool ret; /* * Right now, we can't remove the last device from a tier, @@ -150,8 +122,11 @@ static inline bool bch_cache_may_remove(struct cache *ca) * whole cache set RO. 
*/ - return tier->nr_devices != 1 || - rcu_access_pointer(tier->devices[0]) != ca; + mutex_lock(&tier->lock); + ret = tier->nr_devices != 1 || tier->devices[0].dev != ca; + mutex_unlock(&tier->lock); + + return ret; } void free_super(struct bcache_superblock *); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 9a0d89d6223c..05a778c0f0c9 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -388,9 +388,9 @@ void bch_bio_free_pages(struct bio *bio) __free_page(bv->bv_page); } -size_t bch_rand_range(size_t max) +u64 bch_rand_range(u64 max) { - size_t rand; + u64 rand; do { get_random_bytes(&rand, sizeof(rand)); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 97eea8892572..757a575e28bd 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -625,7 +625,7 @@ do { \ _ret; \ }) -size_t bch_rand_range(size_t); +u64 bch_rand_range(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); |