Diffstat (limited to 'fs/bcachefs/alloc.c')
-rw-r--r--	fs/bcachefs/alloc.c	181
1 file changed, 81 insertions, 100 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 3485019c535a..67de0d05535f 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -80,10 +80,10 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
 {
 	unsigned i;
 
-	write_seqcount_begin(&grp->lock);
+	mutex_lock(&grp->lock);
 
 	for (i = 0; i < grp->nr_devices; i++)
-		if (rcu_access_pointer(grp->devices[i]) == ca) {
+		if (grp->devices[i].dev == ca) {
 			grp->nr_devices--;
 			memmove(&grp->devices[i],
 				&grp->devices[i + 1],
@@ -91,16 +91,40 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
 			break;
 		}
 
-	write_seqcount_end(&grp->lock);
+	grp->next_alloc %= grp->nr_devices;
+
+	mutex_unlock(&grp->lock);
 }
 
-static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
+__must_check
+static int bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
 {
-	write_seqcount_begin(&grp->lock);
-	BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+	mutex_lock(&grp->lock);
+
+	if (grp->nr_devices == grp->nr_devices_max) {
+		struct cache_group_entry *new_devices;
+		size_t new_max = grp->nr_devices_max
+			? grp->nr_devices_max * 2
+			: 8;
+
+		new_devices = krealloc(grp->devices,
+				       new_max * sizeof(grp->devices[0]),
+				       GFP_KERNEL);
+		if (!new_devices) {
+			mutex_unlock(&grp->lock);
+			return -ENOMEM;
+		}
+
+		grp->devices = new_devices;
+	}
 
-	rcu_assign_pointer(grp->devices[grp->nr_devices++], ca);
-	write_seqcount_end(&grp->lock);
+	grp->devices[grp->nr_devices++] = (struct cache_group_entry) {
+		.dev = ca,
+	};
+
+	mutex_unlock(&grp->lock);
+
+	return 0;
 }
 
 /* Ratelimiting/PD controllers */
@@ -124,9 +148,9 @@ static void pd_controllers_update(struct work_struct *work)
 	memset(tier_free, 0, sizeof(tier_free));
 	memset(tier_dirty, 0, sizeof(tier_dirty));
 
-	rcu_read_lock();
-	for (i = CACHE_TIERS - 1; i >= 0; --i)
-		group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+	for (i = CACHE_TIERS - 1; i >= 0; --i) {
+		mutex_lock(&c->cache_tiers[i].lock);
+		group_for_each_cache(ca, &c->cache_tiers[i], iter) {
 			struct bucket_stats_cache stats =
 				bch_bucket_stats_read_cache(ca);
 			unsigned bucket_bits = ca->bucket_bits + 9;
@@ -159,7 +183,8 @@ static void pd_controllers_update(struct work_struct *work)
 			tier_free[i] += free;
 			tier_dirty[i] += stats.buckets_dirty << bucket_bits;
 		}
-	rcu_read_unlock();
+		mutex_unlock(&c->cache_tiers[i].lock);
+	}
 
 	if (tier_size[1]) {
 		u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
@@ -917,64 +942,11 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
 
 enum bucket_alloc_ret {
 	ALLOC_SUCCESS,
-	CACHE_SET_FULL,		/* -ENOSPC */
+	CACHE_SET_FULL,		/* No devices to allocate from */
 	BUCKETS_NOT_AVAILABLE,	/* Device full */
 	FREELIST_EMPTY,		/* Allocator thread not keeping up */
 };
 
-static struct cache *bch_next_cache(struct cache_set *c,
-				    enum alloc_reserve reserve,
-				    struct cache_group *devs,
-				    long *cache_used)
-{
-	struct cache *ca;
-	size_t bucket_count = 0, rand;
-	unsigned i;
-
-	/*
-	 * first ptr allocation will always go to the specified tier,
-	 * 2nd and greater can go to any. If one tier is significantly larger
-	 * it is likely to go that tier.
-	 */
-
-	group_for_each_cache_rcu(ca, devs, i) {
-		if (test_bit(ca->sb.nr_this_dev, cache_used))
-			continue;
-
-		bucket_count += buckets_free_cache(ca, reserve);
-	}
-
-	if (!bucket_count)
-		return ERR_PTR(-BUCKETS_NOT_AVAILABLE);
-
-	/*
-	 * We create a weighted selection by using the number of free buckets
-	 * in each cache. You can think of this like lining up the caches
-	 * linearly so each has a given range, corresponding to the number of
-	 * free buckets in that cache, and then randomly picking a number
-	 * within that range.
-	 */
-
-	rand = bch_rand_range(bucket_count);
-
-	group_for_each_cache_rcu(ca, devs, i) {
-		if (test_bit(ca->sb.nr_this_dev, cache_used))
-			continue;
-
-		bucket_count -= buckets_free_cache(ca, reserve);
-
-		if (rand >= bucket_count)
-			return ca;
-	}
-
-	/*
-	 * If we fall off the end, it means we raced because of bucket counters
-	 * changing - return NULL so __bch_bucket_alloc_set() knows to retry
-	 */
-
-	return NULL;
-}
-
 /*
  * XXX: change the alloc_group to explicitly round robin across devices
  */
@@ -986,48 +958,45 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 					       long *caches_used)
 {
 	enum bucket_alloc_ret ret;
+	struct cache_group_entry *i;
+	unsigned idx, start_idx;
+	u64 buckets_free_total = 0, buckets_free_fraction, r;
+	bool first_loop = true;
 
 	BUG_ON(nr_replicas > BCH_REPLICAS_MAX);
 
-	if (!devs->nr_devices)
+	mutex_lock(&devs->lock);
+
+	if (!devs->nr_devices) {
+		mutex_unlock(&devs->lock);
 		return CACHE_SET_FULL;
+	}
 
-	rcu_read_lock();
+	for (i = devs->devices;
+	     i < devs->devices + devs->nr_devices;
+	     i++) {
+		i->buckets_free = buckets_free_cache(i->dev, reserve);
+		buckets_free_total += i->buckets_free;
+	}
 
-	/* sort by free space/prio of oldest data in caches */
+	idx = start_idx = devs->next_alloc;
 
 	while (ob->nr_ptrs < nr_replicas) {
 		struct cache *ca;
-		unsigned seq;
-		size_t r;
 
-		/* first ptr goes to the specified tier, the rest to any */
-		do {
-			struct cache_group *d;
+		i = &devs->devices[idx];
+		ca = i->dev;
 
-			seq = read_seqcount_begin(&devs->lock);
+		if (test_bit(ca->sb.nr_this_dev, caches_used))
+			goto next;
 
-			d = (!ob->nr_ptrs && devs == &c->cache_all &&
-			     c->cache_tiers[0].nr_devices)
-				? &c->cache_tiers[0]
-				: devs;
+		buckets_free_fraction = (u64) i->buckets_free *
+			devs->nr_devices;
 
-			ca = devs->nr_devices
-				? bch_next_cache(c, reserve, d, caches_used)
-				: ERR_PTR(-CACHE_SET_FULL);
-
-			/*
-			 * If ca == NULL, we raced because of bucket counters
-			 * changing
-			 */
-		} while (read_seqcount_retry(&devs->lock, seq) || !ca);
-
-		if (IS_ERR(ca)) {
-			ret = -PTR_ERR(ca);
-			goto err;
-		}
-
-		__set_bit(ca->sb.nr_this_dev, caches_used);
+		if (first_loop &&
+		    (buckets_free_fraction < buckets_free_total &&
+		     buckets_free_fraction < bch_rand_range(buckets_free_total)))
+			goto next;
 
 		r = bch_bucket_alloc(ca, reserve);
 		if (!r) {
@@ -1035,6 +1004,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 			goto err;
 		}
 
+		__set_bit(ca->sb.nr_this_dev, caches_used);
+
 		/*
 		 * open_bucket_add_buckets expects new pointers at the head of
 		 * the list:
@@ -1048,9 +1019,19 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 			.offset		= bucket_to_sector(ca, r),
 			.dev		= ca->sb.nr_this_dev,
 		};
+next:
+		idx++;
+		idx %= devs->nr_devices;
+
+		if (idx == start_idx) {
+			if (!first_loop)
+				break;
+			first_loop = false;
+		}
 	}
 
-	rcu_read_unlock();
+	mutex_unlock(&devs->lock);
+
 	return ALLOC_SUCCESS;
 err:
 	rcu_read_unlock();
@@ -1785,7 +1766,10 @@ void bch_open_buckets_init(struct cache_set *c)
 		list_add(&c->open_buckets[i].list, &c->open_buckets_free);
 	}
 
-	seqcount_init(&c->cache_all.lock);
+	mutex_init(&c->cache_all.lock);
+
+	for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
+		mutex_init(&c->cache_tiers[i].lock);
 
 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
 		c->write_points[i].throttle	= true;
@@ -1793,9 +1777,6 @@ void bch_open_buckets_init(struct cache_set *c)
 		c->write_points[i].group	= &c->cache_tiers[0];
 	}
 
-	for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
-		seqcount_init(&c->cache_tiers[i].lock);
-
 	c->promote_write_point.group		= &c->cache_tiers[0];
 	c->promote_write_point.reserve		= RESERVE_NONE;
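
For context, the comment deleted along with bch_next_cache() describes the old selection scheme: line the caches up along a number line, give each one a segment as wide as its count of free buckets, then pick a uniformly random point, so a cache is chosen with probability proportional to its free space. Below is a minimal userspace sketch of that idea; rand() stands in for bch_rand_range(), and weighted_pick() and its arguments are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Weighted random pick, in the style of the deleted bch_next_cache():
 * cache i is chosen with probability free_bkts[i] / total.
 */
static int weighted_pick(const uint64_t *free_bkts, unsigned nr)
{
	uint64_t total = 0, r;
	unsigned i;

	for (i = 0; i < nr; i++)
		total += free_bkts[i];
	if (!total)
		return -1;	/* BUCKETS_NOT_AVAILABLE in the original */

	r = (uint64_t) rand() % total;	/* stand-in for bch_rand_range(total) */

	for (i = 0; i < nr; i++) {
		if (r < free_bkts[i])
			return i;	/* the point fell within segment i */
		r -= free_bkts[i];
	}
	return -1;	/* unreachable while the counts stay fixed */
}

int main(void)
{
	const uint64_t free_bkts[3] = { 300, 100, 600 };
	unsigned hits[3] = { 0, 0, 0 }, i;

	for (i = 0; i < 10000; i++)
		hits[weighted_pick(free_bkts, 3)]++;

	/* expect roughly a 3:1:6 split across the three caches */
	for (i = 0; i < 3; i++)
		printf("cache %u: %u picks\n", i, hits[i]);
	return 0;
}

The NULL return on the fall-off-the-end path existed because, under the old seqlock, the per-device counters really could change mid-walk; with the group now serialized by a mutex that retry case disappears, which is presumably part of why the helper could be removed outright.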
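
The replacement logic in __bch_bucket_alloc_set() keeps that free-space weighting but folds it into a round-robin walk: starting at devs->next_alloc, a device whose free count is below its fair share (buckets_free * nr_devices < buckets_free_total) may be skipped on the first pass, with probability growing the further it falls below average, and a second pass accepts whatever remains. Here is a standalone sketch of just that heuristic, assuming only that bch_rand_range(n) returns a uniform value in [0, n); struct dev, pick_device() and rand_range() are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct dev { uint64_t buckets_free; };

/* stand-in for bch_rand_range(): uniform in [0, max) */
static uint64_t rand_range(uint64_t max)
{
	return (uint64_t) rand() % max;
}

/*
 * Round-robin with a weighted skip: on the first pass, a device holding
 * less than 1/nr of the total free space is passed over with probability
 * proportional to how far below its fair share it is; the second pass
 * takes the next device unconditionally.
 */
static unsigned pick_device(const struct dev *devs, unsigned nr,
			    unsigned *next_alloc)
{
	unsigned idx = *next_alloc, start_idx = idx, i;
	uint64_t total = 0;
	int first_loop = 1;

	for (i = 0; i < nr; i++)
		total += devs[i].buckets_free;

	for (;;) {
		uint64_t fraction = devs[idx].buckets_free * nr;

		if (!first_loop ||
		    fraction >= total ||
		    fraction >= rand_range(total))
			break;			/* take this device */

		idx = (idx + 1) % nr;
		if (idx == start_idx)
			first_loop = 0;		/* second pass: no skipping */
	}

	*next_alloc = (idx + 1) % nr;
	return idx;
}

int main(void)
{
	struct dev devs[3] = { { 100 }, { 10 }, { 100 } };
	unsigned next_alloc = 0, hits[3] = { 0, 0, 0 }, i;

	for (i = 0; i < 30000; i++)
		hits[pick_device(devs, 3, &next_alloc)]++;

	/* the small middle device should be picked noticeably less often */
	for (i = 0; i < 3; i++)
		printf("dev %u: %u picks\n", i, hits[i]);
	return 0;
}

The real loop also skips devices already present in caches_used and can still fail a device in bch_bucket_alloc(), so even its second pass can come up empty; this sketch models only the selection bias, which steers allocation toward devices with more free buckets while still rotating across all of them.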