author    Kent Overstreet <kent.overstreet@gmail.com>    2016-02-20 04:27:56 -0900
committer Kent Overstreet <kent.overstreet@gmail.com>    2016-08-28 19:16:16 -0800
commit    aae4a1ff28578bbb1346deedfffd5152b5b2a49f (patch)
tree      3a0bcc0e78f65dff676e18952f2cbc5a530c89a2
parent    8257a6962f0ce690672fc228d220c82126a54f35 (diff)

rework allocator round robining (bcache-dev-wip)
-rw-r--r--   fs/bcachefs/alloc.c    | 181
-rw-r--r--   fs/bcachefs/bcache.h   |  11
-rw-r--r--   fs/bcachefs/journal.c  |  12
-rw-r--r--   fs/bcachefs/super.c    |   4
-rw-r--r--   fs/bcachefs/super.h    |  45
-rw-r--r--   fs/bcachefs/util.c     |   4
-rw-r--r--   fs/bcachefs/util.h     |   2
7 files changed, 116 insertions, 143 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 3485019c535a..67de0d05535f 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -80,10 +80,10 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
{
unsigned i;
- write_seqcount_begin(&grp->lock);
+ mutex_lock(&grp->lock);
for (i = 0; i < grp->nr_devices; i++)
- if (rcu_access_pointer(grp->devices[i]) == ca) {
+ if (grp->devices[i].dev == ca) {
grp->nr_devices--;
memmove(&grp->devices[i],
&grp->devices[i + 1],
@@ -91,16 +91,40 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
break;
}
- write_seqcount_end(&grp->lock);
+ grp->next_alloc %= grp->nr_devices;
+
+ mutex_unlock(&grp->lock);
}
-static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
+__must_check
+static int bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
{
- write_seqcount_begin(&grp->lock);
- BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+ mutex_lock(&grp->lock);
+
+ if (grp->nr_devices == grp->nr_devices_max) {
+ struct cache_group_entry *new_devices;
+ size_t new_max = grp->nr_devices_max
+ ? grp->nr_devices_max * 2
+ : 8;
+
+ new_devices = krealloc(grp->devices,
+ new_max * sizeof(grp->devices[0]),
+ GFP_KERNEL);
+ if (!new_devices) {
+ mutex_unlock(&grp->lock);
+ return -ENOMEM;
+ }
+
+ grp->devices = new_devices;
+ }
- rcu_assign_pointer(grp->devices[grp->nr_devices++], ca);
- write_seqcount_end(&grp->lock);
+ grp->devices[grp->nr_devices++] = (struct cache_group_entry) {
+ .dev = ca,
+ };
+
+ mutex_unlock(&grp->lock);
+
+ return 0;
}
/* Ratelimiting/PD controllers */
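
[Annotation] The two hunks above replace the fixed, RCU-protected devices[MAX_CACHES_PER_SET] array with a mutex-protected, grow-on-demand array of struct cache_group_entry. Below is a minimal userspace sketch of both helpers, with pthread_mutex_t and realloc() standing in for the kernel mutex and krealloc(..., GFP_KERNEL), and simplified group/entry types of my own. Note the sketch also records the new capacity in nr_max after a successful reallocation and guards the cursor wrap when the group becomes empty; both are additions for the standalone sketch, not claims about the patch.

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct entry { void *dev; unsigned long long buckets_free; };

struct group {
	pthread_mutex_t lock;
	unsigned nr, nr_max, next_alloc;
	struct entry *devices;
};

/* Remove @dev, compacting the array and keeping the round-robin
 * cursor inside the (now smaller) range. */
static void group_remove(struct group *grp, void *dev)
{
	unsigned i;

	pthread_mutex_lock(&grp->lock);

	for (i = 0; i < grp->nr; i++)
		if (grp->devices[i].dev == dev) {
			grp->nr--;
			memmove(&grp->devices[i],
				&grp->devices[i + 1],
				(grp->nr - i) * sizeof(grp->devices[0]));
			break;
		}

	if (grp->nr)			/* empty-group guard added for the sketch */
		grp->next_alloc %= grp->nr;
	else
		grp->next_alloc = 0;

	pthread_mutex_unlock(&grp->lock);
}

/* Append @dev, doubling the backing array (starting at 8 entries)
 * when it is full. */
static int group_add(struct group *grp, void *dev)
{
	pthread_mutex_lock(&grp->lock);

	if (grp->nr == grp->nr_max) {
		unsigned new_max = grp->nr_max ? grp->nr_max * 2 : 8;
		struct entry *new_devices =
			realloc(grp->devices,
				new_max * sizeof(grp->devices[0]));

		if (!new_devices) {
			pthread_mutex_unlock(&grp->lock);
			return -ENOMEM;
		}

		grp->devices = new_devices;
		grp->nr_max  = new_max;	/* capacity update added for the sketch */
	}

	grp->devices[grp->nr++] = (struct entry) { .dev = dev };

	pthread_mutex_unlock(&grp->lock);
	return 0;
}
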
@@ -124,9 +148,9 @@ static void pd_controllers_update(struct work_struct *work)
memset(tier_free, 0, sizeof(tier_free));
memset(tier_dirty, 0, sizeof(tier_dirty));
- rcu_read_lock();
- for (i = CACHE_TIERS - 1; i >= 0; --i)
- group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ for (i = CACHE_TIERS - 1; i >= 0; --i) {
+ mutex_lock(&c->cache_tiers[i].lock);
+ group_for_each_cache(ca, &c->cache_tiers[i], iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
@@ -159,7 +183,8 @@ static void pd_controllers_update(struct work_struct *work)
tier_free[i] += free;
tier_dirty[i] += stats.buckets_dirty << bucket_bits;
}
- rcu_read_unlock();
+ mutex_unlock(&c->cache_tiers[i].lock);
+ }
if (tier_size[1]) {
u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
@@ -917,64 +942,11 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
enum bucket_alloc_ret {
ALLOC_SUCCESS,
- CACHE_SET_FULL, /* -ENOSPC */
+ CACHE_SET_FULL, /* No devices to allocate from */
BUCKETS_NOT_AVAILABLE, /* Device full */
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
-static struct cache *bch_next_cache(struct cache_set *c,
- enum alloc_reserve reserve,
- struct cache_group *devs,
- long *cache_used)
-{
- struct cache *ca;
- size_t bucket_count = 0, rand;
- unsigned i;
-
- /*
- * first ptr allocation will always go to the specified tier,
- * 2nd and greater can go to any. If one tier is significantly larger
- * it is likely to go that tier.
- */
-
- group_for_each_cache_rcu(ca, devs, i) {
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
-
- bucket_count += buckets_free_cache(ca, reserve);
- }
-
- if (!bucket_count)
- return ERR_PTR(-BUCKETS_NOT_AVAILABLE);
-
- /*
- * We create a weighted selection by using the number of free buckets
- * in each cache. You can think of this like lining up the caches
- * linearly so each as a given range, corresponding to the number of
- * free buckets in that cache, and then randomly picking a number
- * within that range.
- */
-
- rand = bch_rand_range(bucket_count);
-
- group_for_each_cache_rcu(ca, devs, i) {
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
-
- bucket_count -= buckets_free_cache(ca, reserve);
-
- if (rand >= bucket_count)
- return ca;
- }
-
- /*
- * If we fall off the end, it means we raced because of bucket counters
- * changing - return NULL so __bch_bucket_alloc_set() knows to retry
- */
-
- return NULL;
-}
-
/*
* XXX: change the alloc_group to explicitly round robin across devices
*/
@@ -986,48 +958,45 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
long *caches_used)
{
enum bucket_alloc_ret ret;
+ struct cache_group_entry *i;
+ unsigned idx, start_idx;
+ u64 buckets_free_total = 0, buckets_free_fraction, r;
+ bool first_loop = true;
BUG_ON(nr_replicas > BCH_REPLICAS_MAX);
- if (!devs->nr_devices)
+ mutex_lock(&devs->lock);
+
+ if (!devs->nr_devices) {
+ mutex_unlock(&devs->lock);
return CACHE_SET_FULL;
+ }
- rcu_read_lock();
+ for (i = devs->devices;
+ i < devs->devices + devs->nr_devices;
+ i++) {
+ i->buckets_free = buckets_free_cache(i->dev, reserve);
+ buckets_free_total += i->buckets_free;
+ }
- /* sort by free space/prio of oldest data in caches */
+ idx = start_idx = devs->next_alloc;
while (ob->nr_ptrs < nr_replicas) {
struct cache *ca;
- unsigned seq;
- size_t r;
- /* first ptr goes to the specified tier, the rest to any */
- do {
- struct cache_group *d;
+ i = &devs->devices[idx];
+ ca = i->dev;
- seq = read_seqcount_begin(&devs->lock);
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ goto next;
- d = (!ob->nr_ptrs && devs == &c->cache_all &&
- c->cache_tiers[0].nr_devices)
- ? &c->cache_tiers[0]
- : devs;
+ buckets_free_fraction = (u64) i->buckets_free *
+ devs->nr_devices;
- ca = devs->nr_devices
- ? bch_next_cache(c, reserve, d, caches_used)
- : ERR_PTR(-CACHE_SET_FULL);
-
- /*
- * If ca == NULL, we raced because of bucket counters
- * changing
- */
- } while (read_seqcount_retry(&devs->lock, seq) || !ca);
-
- if (IS_ERR(ca)) {
- ret = -PTR_ERR(ca);
- goto err;
- }
-
- __set_bit(ca->sb.nr_this_dev, caches_used);
+ if (first_loop &&
+ (buckets_free_fraction < buckets_free_total &&
+ buckets_free_fraction < bch_rand_range(buckets_free_total)))
+ goto next;
r = bch_bucket_alloc(ca, reserve);
if (!r) {
@@ -1035,6 +1004,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
goto err;
}
+ __set_bit(ca->sb.nr_this_dev, caches_used);
+
/*
* open_bucket_add_buckets expects new pointers at the head of
* the list:
@@ -1048,9 +1019,19 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
.offset = bucket_to_sector(ca, r),
.dev = ca->sb.nr_this_dev,
};
+next:
+ idx++;
+ idx %= devs->nr_devices;
+
+ if (idx == start_idx) {
+ if (!first_loop)
+ break;
+ first_loop = false;
+ }
}
- rcu_read_unlock();
+ mutex_unlock(&devs->lock);
+
return ALLOC_SUCCESS;
err:
rcu_read_unlock();
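
[Annotation] The rewritten __bch_bucket_alloc_set() above walks the group round-robin starting from next_alloc, after caching each device's free-bucket count and the group total at the top of the pass. On the first loop a device with below-average free space may be skipped; on the second loop every device is tried. A small sketch of just that first-pass test, with the random draw passed in (this is what bch_rand_range(buckets_free_total) produces in the patch); the helper name is mine, not the patch's.

#include <stdbool.h>
#include <stdint.h>

/*
 * First-pass filter for the round-robin walk: @r is uniformly random
 * in [0, buckets_free_total).  A device with at least the group-average
 * amount of free space is never skipped; one below average is kept
 * with probability (buckets_free * nr_devices) / buckets_free_total.
 */
static bool skip_on_first_pass(uint64_t buckets_free,
			       uint64_t buckets_free_total,
			       unsigned nr_devices,
			       uint64_t r)
{
	uint64_t fraction = buckets_free * nr_devices;

	return fraction < buckets_free_total && fraction < r;
}
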
@@ -1785,7 +1766,10 @@ void bch_open_buckets_init(struct cache_set *c)
list_add(&c->open_buckets[i].list, &c->open_buckets_free);
}
- seqcount_init(&c->cache_all.lock);
+ mutex_init(&c->cache_all.lock);
+
+ for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
+ mutex_init(&c->cache_tiers[i].lock);
for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
c->write_points[i].throttle = true;
@@ -1793,9 +1777,6 @@ void bch_open_buckets_init(struct cache_set *c)
c->write_points[i].group = &c->cache_tiers[0];
}
- for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- seqcount_init(&c->cache_tiers[i].lock);
-
c->promote_write_point.group = &c->cache_tiers[0];
c->promote_write_point.reserve = RESERVE_NONE;
diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h
index a6bbd38f4316..37237109299a 100644
--- a/fs/bcachefs/bcache.h
+++ b/fs/bcachefs/bcache.h
@@ -314,10 +314,17 @@ struct gc_pos {
unsigned level;
};
+struct cache_group_entry {
+ struct cache *dev;
+ u64 buckets_free;
+};
+
struct cache_group {
- seqcount_t lock;
+ struct mutex lock;
unsigned nr_devices;
- struct cache __rcu *devices[MAX_CACHES_PER_SET];
+ unsigned nr_devices_max;
+ unsigned next_alloc;
+ struct cache_group_entry *devices;
};
struct cache_member_cpu {
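
[Annotation] The reworked struct cache_group above implies a few invariants: the device count never exceeds the allocated capacity, a non-zero capacity means the array is allocated, and the remove path keeps the round-robin cursor below the device count while the group is non-empty. A hypothetical debug helper spelling those out — this function does not exist in the patch:

static void cache_group_verify(struct cache_group *grp)
{
	lockdep_assert_held(&grp->lock);

	BUG_ON(grp->nr_devices > grp->nr_devices_max);
	BUG_ON(grp->nr_devices_max && !grp->devices);
	BUG_ON(grp->nr_devices && grp->next_alloc >= grp->nr_devices);
}
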
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 1b49c44f92c4..ad97bb834f8a 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1537,6 +1537,7 @@ static void journal_reclaim_work(struct work_struct *work)
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
+ mutex_lock(&c->cache_tiers[0].lock);
group_for_each_cache(ca, &c->cache_tiers[0], iter) {
struct journal_device *ja = &ca->journal;
@@ -1584,6 +1585,7 @@ static void journal_reclaim_work(struct work_struct *work)
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
}
+ mutex_unlock(&c->cache_tiers[0].lock);
if (reclaim_lock_held)
mutex_unlock(&j->reclaim_lock);
@@ -1634,7 +1636,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* Determine location of the next journal write:
* XXX: sort caches by free journal space
*/
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) {
+
+ /* XXX: use devices from other tiers if insufficient tier 0 devices */
+ mutex_lock(&c->cache_tiers[0].lock);
+ group_for_each_cache(ca, &c->cache_tiers[0], iter) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
@@ -1664,6 +1669,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
}
+ mutex_unlock(&c->cache_tiers[0].lock);
rcu_read_unlock();
spin_unlock(&j->lock);
@@ -2264,7 +2270,8 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) {
+ mutex_lock(&c->cache_tiers[0].lock);
+ group_for_each_cache(ca, &c->cache_tiers[0], iter) {
struct journal_device *ja = &ca->journal;
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
@@ -2276,6 +2283,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
+ mutex_unlock(&c->cache_tiers[0].lock);
spin_unlock(&j->lock);
rcu_read_unlock();
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index beb0587be4ce..1938bec2bbe1 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -146,7 +146,8 @@ static int bch_congested_fn(void *data, int bdi_bits)
}
} else {
/* Writes only go to tier 0: */
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ mutex_lock(&c->cache_tiers[0].lock);
+ group_for_each_cache(ca, &c->cache_tiers[0], i) {
bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
if (bdi_congested(bdi, bdi_bits)) {
@@ -154,6 +155,7 @@ static int bch_congested_fn(void *data, int bdi_bits)
break;
}
}
+ mutex_unlock(&c->cache_tiers[0].lock);
}
rcu_read_unlock();
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 24f9b0ead9c8..8ec9fcffabdf 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -59,40 +59,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
(ca = bch_get_next_cache(c, &(iter))); \
percpu_ref_put(&ca->ref), (iter)++)
-static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
- unsigned *iter)
-{
- struct cache *ret = NULL;
-
- while (*iter < devs->nr_devices &&
- !(ret = rcu_dereference(devs->devices[*iter])))
- (*iter)++;
-
- return ret;
-}
-
-#define group_for_each_cache_rcu(ca, devs, iter) \
- for ((iter) = 0; \
- ((ca) = cache_group_next_rcu((devs), &(iter))); \
- (iter)++)
-
-static inline struct cache *cache_group_next(struct cache_group *devs,
- unsigned *iter)
-{
- struct cache *ret;
-
- rcu_read_lock();
- if ((ret = cache_group_next_rcu(devs, iter)))
- percpu_ref_get(&ret->ref);
- rcu_read_unlock();
-
- return ret;
-}
-
#define group_for_each_cache(ca, devs, iter) \
- for ((iter) = 0; \
- (ca = cache_group_next(devs, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
+ for (({ lockdep_assert_held(&(devs)->lock); (iter) = 0; }); \
+ (iter) < (devs)->nr_devices && \
+ ((ca) = (devs)->devices[(iter)].dev, true); \
+ (iter)++)
void bch_check_mark_super_slowpath(struct cache_set *,
const struct bkey_i *, bool);
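
[Annotation] group_for_each_cache() is now a plain indexed walk that asserts the group mutex is held, so every iteration site touched by this patch (alloc.c, journal.c, super.c) brackets the loop with mutex_lock()/mutex_unlock() on the group's lock. A usage sketch under that assumption; tier0_free_buckets() is a hypothetical helper for illustration, not something the patch adds.

/* Hypothetical example: sum free buckets in tier 0 while holding the
 * group mutex, satisfying the lockdep_assert_held() in
 * group_for_each_cache(). */
static u64 tier0_free_buckets(struct cache_set *c, enum alloc_reserve reserve)
{
	struct cache_group *grp = &c->cache_tiers[0];
	struct cache *ca;
	unsigned iter;
	u64 total = 0;

	mutex_lock(&grp->lock);
	group_for_each_cache(ca, grp, iter)
		total += buckets_free_cache(ca, reserve);
	mutex_unlock(&grp->lock);

	return total;
}
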
@@ -132,6 +103,7 @@ static inline bool bch_cache_may_remove(struct cache *ca)
{
struct cache_set *c = ca->set;
struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ bool ret;
/*
* Right now, we can't remove the last device from a tier,
@@ -150,8 +122,11 @@ static inline bool bch_cache_may_remove(struct cache *ca)
* whole cache set RO.
*/
- return tier->nr_devices != 1 ||
- rcu_access_pointer(tier->devices[0]) != ca;
+ mutex_lock(&tier->lock);
+ ret = tier->nr_devices != 1 || tier->devices[0].dev != ca;
+ mutex_unlock(&tier->lock);
+
+ return ret;
}
void free_super(struct bcache_superblock *);
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 9a0d89d6223c..05a778c0f0c9 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -388,9 +388,9 @@ void bch_bio_free_pages(struct bio *bio)
__free_page(bv->bv_page);
}
-size_t bch_rand_range(size_t max)
+u64 bch_rand_range(u64 max)
{
- size_t rand;
+ u64 rand;
do {
get_random_bytes(&rand, sizeof(rand));
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 97eea8892572..757a575e28bd 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -625,7 +625,7 @@ do { \
_ret; \
})
-size_t bch_rand_range(size_t);
+u64 bch_rand_range(u64);
void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
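
[Annotation] Finally, bch_rand_range() is widened from size_t to u64, presumably so the 64-bit free-bucket totals used by the allocator don't get truncated on 32-bit builds. Its body is only partially visible in the util.c hunk above; below is a hedged userspace sketch of the general mask-and-reject shape of such a helper (uniform in [0, max)) — not the bcachefs implementation, and get_random_u64()/rand_range() are names of my own.

#include <stdint.h>
#include <stdlib.h>

/* Crude userspace stand-in for the kernel's get_random_bytes();
 * do not use for anything real. */
static uint64_t get_random_u64(void)
{
	return ((uint64_t) rand() << 32) | (uint64_t) rand();
}

/* Uniform random integer in [0, max): mask down to the next power of
 * two, then reject values that land past max so every result is
 * equally likely. */
static uint64_t rand_range(uint64_t max)
{
	uint64_t mask = max - 1, r;

	if (!max)
		return 0;

	/* round mask up to a power of two minus one */
	mask |= mask >> 1;  mask |= mask >> 2;  mask |= mask >> 4;
	mask |= mask >> 8;  mask |= mask >> 16; mask |= mask >> 32;

	do {
		r = get_random_u64() & mask;
	} while (r >= max);

	return r;
}
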