Diffstat (limited to 'fs/bcachefs/alloc.c')
-rw-r--r--  fs/bcachefs/alloc.c  181
1 file changed, 81 insertions, 100 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 3485019c535a..67de0d05535f 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -80,10 +80,10 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
{
unsigned i;
- write_seqcount_begin(&grp->lock);
+ mutex_lock(&grp->lock);
for (i = 0; i < grp->nr_devices; i++)
- if (rcu_access_pointer(grp->devices[i]) == ca) {
+ if (grp->devices[i].dev == ca) {
grp->nr_devices--;
memmove(&grp->devices[i],
&grp->devices[i + 1],
@@ -91,16 +91,40 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
break;
}
- write_seqcount_end(&grp->lock);
+ grp->next_alloc %= grp->nr_devices;
+
+ mutex_unlock(&grp->lock);
}
-static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
+__must_check
+static int bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
{
- write_seqcount_begin(&grp->lock);
- BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+ mutex_lock(&grp->lock);
+
+ if (grp->nr_devices == grp->nr_devices_max) {
+ struct cache_group_entry *new_devices;
+ size_t new_max = grp->nr_devices_max
+ ? grp->nr_devices_max * 2
+ : 8;
+
+ new_devices = krealloc(grp->devices,
+ new_max * sizeof(grp->devices[0]),
+ GFP_KERNEL);
+ if (!new_devices) {
+ mutex_unlock(&grp->lock);
+ return -ENOMEM;
+ }
+
+ grp->devices = new_devices;
+ }
- rcu_assign_pointer(grp->devices[grp->nr_devices++], ca);
- write_seqcount_end(&grp->lock);
+ grp->devices[grp->nr_devices++] = (struct cache_group_entry) {
+ .dev = ca,
+ };
+
+ mutex_unlock(&grp->lock);
+
+ return 0;
}
/* Ratelimiting/PD controllers */
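The two hunks above replace the seqcount/RCU scheme with a plain mutex and turn the fixed devices[] array into a krealloc()-grown array of struct cache_group_entry. The struct definitions live in a header that is not part of this diff; the sketch below is an assumption about how the reworked cache_group could look, inferred only from the fields the diff touches (lock, nr_devices, nr_devices_max, next_alloc, devices[].dev, devices[].buckets_free), not the actual bcachefs header.

/* Hedged sketch, fields inferred from the diff; not the real header. */
struct cache_group_entry {
	struct cache		*dev;
	u64			buckets_free;	/* snapshot used for weighted allocation */
};

struct cache_group {
	struct mutex			lock;		/* replaces the old seqcount_t */
	unsigned			nr_devices;	/* entries in use */
	unsigned			nr_devices_max;	/* allocated capacity */
	unsigned			next_alloc;	/* round-robin cursor */
	struct cache_group_entry	*devices;	/* krealloc()-grown array */
};

Growing the array by doubling (starting at 8) keeps the number of krealloc() calls logarithmic in the number of member devices, at the cost of bch_cache_group_add_cache() now being able to fail with -ENOMEM.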
@@ -124,9 +148,9 @@ static void pd_controllers_update(struct work_struct *work)
memset(tier_free, 0, sizeof(tier_free));
memset(tier_dirty, 0, sizeof(tier_dirty));
- rcu_read_lock();
- for (i = CACHE_TIERS - 1; i >= 0; --i)
- group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ for (i = CACHE_TIERS - 1; i >= 0; --i) {
+ mutex_lock(&c->cache_tiers[i].lock);
+ group_for_each_cache(ca, &c->cache_tiers[i], iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
@@ -159,7 +183,8 @@ static void pd_controllers_update(struct work_struct *work)
tier_free[i] += free;
tier_dirty[i] += stats.buckets_dirty << bucket_bits;
}
- rcu_read_unlock();
+ mutex_unlock(&c->cache_tiers[i].lock);
+ }
if (tier_size[1]) {
u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
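With the tier lock now a mutex, pd_controllers_update() takes it around the whole walk instead of relying on rcu_read_lock(). The group_for_each_cache() iterator is defined elsewhere; the following is a guess at how it could be written over the new array layout, assuming the caller holds grp->lock. The macro in the patched tree may differ.

/* Hedged sketch of iteration over the mutex-protected array. */
#define group_for_each_cache(ca, grp, iter)				\
	for ((iter) = 0;						\
	     (iter) < (grp)->nr_devices &&				\
	     ((ca) = (grp)->devices[(iter)].dev, true);			\
	     (iter)++)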
@@ -917,64 +942,11 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
enum bucket_alloc_ret {
ALLOC_SUCCESS,
- CACHE_SET_FULL, /* -ENOSPC */
+ CACHE_SET_FULL, /* No devices to allocate from */
BUCKETS_NOT_AVAILABLE, /* Device full */
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
-static struct cache *bch_next_cache(struct cache_set *c,
- enum alloc_reserve reserve,
- struct cache_group *devs,
- long *cache_used)
-{
- struct cache *ca;
- size_t bucket_count = 0, rand;
- unsigned i;
-
- /*
- * first ptr allocation will always go to the specified tier,
- * 2nd and greater can go to any. If one tier is significantly larger
- * it is likely to go that tier.
- */
-
- group_for_each_cache_rcu(ca, devs, i) {
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
-
- bucket_count += buckets_free_cache(ca, reserve);
- }
-
- if (!bucket_count)
- return ERR_PTR(-BUCKETS_NOT_AVAILABLE);
-
- /*
- * We create a weighted selection by using the number of free buckets
- * in each cache. You can think of this like lining up the caches
- * linearly so each as a given range, corresponding to the number of
- * free buckets in that cache, and then randomly picking a number
- * within that range.
- */
-
- rand = bch_rand_range(bucket_count);
-
- group_for_each_cache_rcu(ca, devs, i) {
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
-
- bucket_count -= buckets_free_cache(ca, reserve);
-
- if (rand >= bucket_count)
- return ca;
- }
-
- /*
- * If we fall off the end, it means we raced because of bucket counters
- * changing - return NULL so __bch_bucket_alloc_set() knows to retry
- */
-
- return NULL;
-}
-
/*
* XXX: change the alloc_group to explicitly round robin across devices
*/
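The deleted bch_next_cache() picked a device at random, weighted by free buckets, exactly as its comment describes: lay the devices end to end by free-bucket count and choose a uniform random point along that line. As a standalone illustration of that selection (userspace toy code, not the kernel implementation; rand() stands in for bch_rand_range(), and the caller must ensure the total is non-zero, mirroring the BUCKETS_NOT_AVAILABLE check above):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Weighted pick: index i is chosen with probability free_buckets[i] / total. */
static size_t weighted_pick(const uint64_t *free_buckets, size_t nr)
{
	uint64_t total = 0, r;
	size_t i;

	for (i = 0; i < nr; i++)
		total += free_buckets[i];

	/* Stand-in for bch_rand_range(total); total must be > 0. */
	r = (uint64_t)rand() % total;

	for (i = 0; i < nr; i++) {
		if (r < free_buckets[i])
			return i;
		r -= free_buckets[i];
	}
	return nr - 1;	/* not reached when total > 0 */
}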
@@ -986,48 +958,45 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
long *caches_used)
{
enum bucket_alloc_ret ret;
+ struct cache_group_entry *i;
+ unsigned idx, start_idx;
+ u64 buckets_free_total = 0, buckets_free_fraction, r;
+ bool first_loop = true;
BUG_ON(nr_replicas > BCH_REPLICAS_MAX);
- if (!devs->nr_devices)
+ mutex_lock(&devs->lock);
+
+ if (!devs->nr_devices) {
+ mutex_unlock(&devs->lock);
return CACHE_SET_FULL;
+ }
- rcu_read_lock();
+ for (i = devs->devices;
+ i < devs->devices + devs->nr_devices;
+ i++) {
+ i->buckets_free = buckets_free_cache(i->dev, reserve);
+ buckets_free_total += i->buckets_free;
+ }
- /* sort by free space/prio of oldest data in caches */
+ idx = start_idx = devs->next_alloc;
while (ob->nr_ptrs < nr_replicas) {
struct cache *ca;
- unsigned seq;
- size_t r;
- /* first ptr goes to the specified tier, the rest to any */
- do {
- struct cache_group *d;
+ i = &devs->devices[idx];
+ ca = i->dev;
- seq = read_seqcount_begin(&devs->lock);
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ goto next;
- d = (!ob->nr_ptrs && devs == &c->cache_all &&
- c->cache_tiers[0].nr_devices)
- ? &c->cache_tiers[0]
- : devs;
+ buckets_free_fraction = (u64) i->buckets_free *
+ devs->nr_devices;
- ca = devs->nr_devices
- ? bch_next_cache(c, reserve, d, caches_used)
- : ERR_PTR(-CACHE_SET_FULL);
-
- /*
- * If ca == NULL, we raced because of bucket counters
- * changing
- */
- } while (read_seqcount_retry(&devs->lock, seq) || !ca);
-
- if (IS_ERR(ca)) {
- ret = -PTR_ERR(ca);
- goto err;
- }
-
- __set_bit(ca->sb.nr_this_dev, caches_used);
+ if (first_loop &&
+ (buckets_free_fraction < buckets_free_total &&
+ buckets_free_fraction < bch_rand_range(buckets_free_total)))
+ goto next;
r = bch_bucket_alloc(ca, reserve);
if (!r) {
@@ -1035,6 +1004,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
goto err;
}
+ __set_bit(ca->sb.nr_this_dev, caches_used);
+
/*
* open_bucket_add_buckets expects new pointers at the head of
* the list:
@@ -1048,9 +1019,19 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
.offset = bucket_to_sector(ca, r),
.dev = ca->sb.nr_this_dev,
};
+next:
+ idx++;
+ idx %= devs->nr_devices;
+
+ if (idx == start_idx) {
+ if (!first_loop)
+ break;
+ first_loop = false;
+ }
}
- rcu_read_unlock();
+ mutex_unlock(&devs->lock);
+
return ALLOC_SUCCESS;
err:
rcu_read_unlock();
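The replacement loop in __bch_bucket_alloc_set() walks the group round robin starting at devs->next_alloc and, on the first pass only, probabilistically skips devices whose share of free buckets is below average. The skip test can be read as the helper below; this is a restatement for clarity, not code from the patch, and it assumes bch_rand_range(max), which the hunk itself calls, returns a uniform u64 in [0, max).

/* Illustrative restatement of the first-pass filter used above. */
static bool keep_on_first_pass(u64 buckets_free, unsigned nr_devices,
			       u64 buckets_free_total)
{
	u64 fraction = buckets_free * nr_devices;

	/*
	 * A device holding at least an average share of the group's free
	 * buckets (fraction >= total) is always kept; a below-average
	 * device survives with probability roughly fraction / total.
	 */
	return fraction >= buckets_free_total ||
	       fraction >= bch_rand_range(buckets_free_total);
}

On the second pass (first_loop == false) the weighting is dropped, so any device not already set in caches_used is acceptable.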
@@ -1785,7 +1766,10 @@ void bch_open_buckets_init(struct cache_set *c)
list_add(&c->open_buckets[i].list, &c->open_buckets_free);
}
- seqcount_init(&c->cache_all.lock);
+ mutex_init(&c->cache_all.lock);
+
+ for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
+ mutex_init(&c->cache_tiers[i].lock);
for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
c->write_points[i].throttle = true;
@@ -1793,9 +1777,6 @@ void bch_open_buckets_init(struct cache_set *c)
c->write_points[i].group = &c->cache_tiers[0];
}
- for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- seqcount_init(&c->cache_tiers[i].lock);
-
c->promote_write_point.group = &c->cache_tiers[0];
c->promote_write_point.reserve = RESERVE_NONE;
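Since bch_cache_group_add_cache() can now fail and is marked __must_check, callers outside this file have to handle -ENOMEM and unwind any earlier group membership. A hypothetical caller sketch, assuming the usual cache_all plus per-tier registration (the function name and error path here are illustrative, not taken from the patch):

/* Hypothetical caller sketch; bch_cache_attach() is not a real function in this patch. */
static int bch_cache_attach(struct cache_set *c, struct cache *ca,
			    struct cache_group *tier)
{
	int ret;

	ret = bch_cache_group_add_cache(&c->cache_all, ca);
	if (ret)
		return ret;	/* -ENOMEM from krealloc() */

	ret = bch_cache_group_add_cache(tier, ca);
	if (ret) {
		bch_cache_group_remove_cache(&c->cache_all, ca);
		return ret;
	}

	return 0;
}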