 drivers/md/bcache/alloc.c       | 145
 drivers/md/bcache/alloc.h       |  40
 drivers/md/bcache/alloc_types.h |  10
 drivers/md/bcache/bcache.h      |   6
 drivers/md/bcache/buckets.h     |  17
 drivers/md/bcache/journal.c     |   1
 drivers/md/bcache/super.c       |   4
 drivers/md/bcache/super.h       |  37
 drivers/md/bcache/sysfs.c       |   2
 drivers/md/bcache/tier.c        |   5
 10 files changed, 128 insertions(+), 139 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index b9b1e171fc9d..47a6d8859193 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/kthread.h>
+#include <linux/random.h>
 #include <linux/rcupdate.h>
 #include <trace/events/bcache.h>
 
@@ -80,27 +81,27 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
 {
 	unsigned i;
 
-	write_seqcount_begin(&grp->lock);
+	spin_lock(&grp->lock);
 
 	for (i = 0; i < grp->nr_devices; i++)
-		if (rcu_access_pointer(grp->devices[i]) == ca) {
+		if (rcu_access_pointer(grp->d[i].dev) == ca) {
 			grp->nr_devices--;
-			memmove(&grp->devices[i],
-				&grp->devices[i + 1],
-				(grp->nr_devices - i) * sizeof(ca));
+			memmove(&grp->d[i],
+				&grp->d[i + 1],
+				(grp->nr_devices - i) * sizeof(grp->d[0]));
 			break;
 		}
 
-	write_seqcount_end(&grp->lock);
+	spin_unlock(&grp->lock);
 }
 
 static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
 {
-	write_seqcount_begin(&grp->lock);
+	spin_lock(&grp->lock);
 
 	BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
-	rcu_assign_pointer(grp->devices[grp->nr_devices++], ca);
-	write_seqcount_end(&grp->lock);
+	rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+	spin_unlock(&grp->lock);
 }
 
 /* Ratelimiting/PD controllers */
@@ -143,8 +144,7 @@ static void pd_controllers_update(struct work_struct *work)
 			u64 dev_size = (ca->mi.nbuckets -
 					ca->mi.first_bucket) << bucket_bits;
 
-			u64 free = __buckets_free_cache(ca, stats,
-						RESERVE_NONE) << bucket_bits;
+			u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
 
 			if (fragmented < 0)
 				fragmented = 0;
@@ -921,7 +921,6 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
 
 enum bucket_alloc_ret {
 	ALLOC_SUCCESS,
 	CACHE_SET_FULL,		/* -ENOSPC */
-	BUCKETS_NOT_AVAILABLE,	/* Device full */
 	FREELIST_EMPTY,		/* Allocator thread not keeping up */
 };
 
@@ -931,50 +930,63 @@ static struct cache *bch_next_cache(struct cache_set *c,
 				    long *cache_used)
 {
 	struct cache *ca;
-	size_t bucket_count = 0, rand;
-	unsigned i;
-
-	/*
-	 * first ptr allocation will always go to the specified tier,
-	 * 2nd and greater can go to any. If one tier is significantly larger
-	 * it is likely to go that tier.
-	 */
+	unsigned i, weight;
+	u64 available_buckets = 0;
 
-	group_for_each_cache_rcu(ca, devs, i) {
-		if (test_bit(ca->sb.nr_this_dev, cache_used))
-			continue;
-
-		bucket_count += buckets_free_cache(ca, reserve);
-	}
+	spin_lock(&devs->lock);
 
-	if (!bucket_count)
-		return ERR_PTR(-BUCKETS_NOT_AVAILABLE);
+	if (devs->nr_devices == 0)
+		goto err;
 
-	/*
-	 * We create a weighted selection by using the number of free buckets
-	 * in each cache. You can think of this like lining up the caches
-	 * linearly so each as a given range, corresponding to the number of
-	 * free buckets in that cache, and then randomly picking a number
-	 * within that range.
-	 */
-
-	rand = bch_rand_range(bucket_count);
-
-	group_for_each_cache_rcu(ca, devs, i) {
+	if (devs->nr_devices == 1) {
+		ca = devs->d[0].dev;
 		if (test_bit(ca->sb.nr_this_dev, cache_used))
-			continue;
+			goto err;
+		goto out;
+	}
 
-		bucket_count -= buckets_free_cache(ca, reserve);
+	/* recalculate weightings: XXX don't do this on every call */
+	for (i = 0; i < devs->nr_devices; i++) {
+		ca = devs->d[i].dev;
 
-		if (rand >= bucket_count)
-			return ca;
+		devs->d[i].weight = buckets_free_cache(ca);
+		available_buckets += devs->d[i].weight;
 	}
 
-	/*
-	 * If we fall off the end, it means we raced because of bucket counters
-	 * changing - return NULL so __bch_bucket_alloc_set() knows to retry
-	 */
+	for (i = 0; i < devs->nr_devices; i++) {
+		const unsigned min_weight = U32_MAX >> 4;
+		const unsigned max_weight = U32_MAX;
+
+		devs->d[i].weight =
+			min_weight +
+			div64_u64(devs->d[i].weight *
+				  devs->nr_devices *
+				  (max_weight - min_weight),
+				  available_buckets);
+		devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
+	}
+	for (i = 0; i < devs->nr_devices; i++)
+		if (!test_bit(devs->d[i].dev->sb.nr_this_dev, cache_used))
+			goto available;
+
+	/* no unused devices: */
+	goto err;
+available:
+	i = devs->cur_device;
+	do {
+		weight = devs->d[i].weight;
+		ca = devs->d[i].dev;
+		i++;
+		i %= devs->nr_devices;
+	} while (test_bit(ca->sb.nr_this_dev, cache_used) ||
+		 get_random_int() > weight);
+	devs->cur_device = i;
+out:
+	spin_unlock(&devs->lock);
+	return ca;
+err:
+	spin_unlock(&devs->lock);
 	return NULL;
 }
 
@@ -1000,38 +1012,23 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 	/* sort by free space/prio of oldest data in caches */
 
 	while (ob->nr_ptrs < nr_replicas) {
+		struct cache_group *d;
 		struct cache *ca;
-		unsigned seq;
 		size_t r;
 
 		/* first ptr goes to the specified tier, the rest to any */
 
-		do {
-			struct cache_group *d;
-
-			seq = read_seqcount_begin(&devs->lock);
-
-			d = (!ob->nr_ptrs && devs == &c->cache_all &&
-			     c->cache_tiers[0].nr_devices)
-				? &c->cache_tiers[0]
-				: devs;
-			ca = devs->nr_devices
-				? bch_next_cache(c, reserve, d, caches_used)
-				: ERR_PTR(-CACHE_SET_FULL);
+		d = (!ob->nr_ptrs && devs == &c->cache_all &&
+		     c->cache_tiers[0].nr_devices)
+			? &c->cache_tiers[0]
+			: devs;
 
-			/*
-			 * If ca == NULL, we raced because of bucket counters
-			 * changing
-			 */
-		} while (read_seqcount_retry(&devs->lock, seq) || !ca);
-
-		if (IS_ERR(ca)) {
-			ret = -PTR_ERR(ca);
+		ca = bch_next_cache(c, reserve, d, caches_used);
+		if (!ca) {
+			ret = CACHE_SET_FULL;
 			goto err;
 		}
 
-		__set_bit(ca->sb.nr_this_dev, caches_used);
-
 		r = bch_bucket_alloc(ca, reserve);
 		if (!r) {
 			ret = FREELIST_EMPTY;
@@ -1051,6 +1048,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 			.offset	= bucket_to_sector(ca, r),
 			.dev	= ca->sb.nr_this_dev,
 		};
+
+		__set_bit(ca->sb.nr_this_dev, caches_used);
 	}
 
 	rcu_read_unlock();
@@ -1084,8 +1083,6 @@ static int bch_bucket_alloc_set(struct cache_set *c, struct open_bucket *ob,
 		closure_wake_up(&c->freelist_wait);
 		return -ENOSPC;
 
-	case BUCKETS_NOT_AVAILABLE:
-		trace_bcache_buckets_unavailable_fail(c, reserve, cl);
 	case FREELIST_EMPTY:
 		if (!cl)
 			return -ENOSPC;
@@ -1807,7 +1804,7 @@ void bch_open_buckets_init(struct cache_set *c)
 		list_add(&c->open_buckets[i].list, &c->open_buckets_free);
 	}
 
-	seqcount_init(&c->cache_all.lock);
+	spin_lock_init(&c->cache_all.lock);
 
 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
 		c->write_points[i].throttle = true;
@@ -1816,7 +1813,7 @@ void bch_open_buckets_init(struct cache_set *c)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
-		seqcount_init(&c->cache_tiers[i].lock);
+		spin_lock_init(&c->cache_tiers[i].lock);
 
 	c->promote_write_point.group = &c->cache_tiers[0];
 	c->promote_write_point.reserve = RESERVE_NONE;
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 3da349c581b3..b02a307b0265 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -35,15 +35,43 @@ static inline void bch_wake_allocator(struct cache *ca)
 	if ((p = ACCESS_ONCE(ca->alloc_thread)))
 		wake_up_process(p);
 	rcu_read_unlock();
+}
+
+static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
+						 unsigned *iter)
+{
+	struct cache *ret = NULL;
+
+	while (*iter < devs->nr_devices &&
+	       !(ret = rcu_dereference(devs->d[*iter].dev)))
+		(*iter)++;
 
-	/*
-	 * XXX: this is only needed because of ca->reserve_buckets_count, but is
-	 * reserve_buckets_count needed anymore? It predates modern
-	 * reservations.
-	 */
-	closure_wake_up(&ca->set->freelist_wait);
+	return ret;
 }
 
+#define group_for_each_cache_rcu(ca, devs, iter)			\
+	for ((iter) = 0;						\
+	     ((ca) = cache_group_next_rcu((devs), &(iter)));		\
+	     (iter)++)
+
+static inline struct cache *cache_group_next(struct cache_group *devs,
+					     unsigned *iter)
+{
+	struct cache *ret;
+
+	rcu_read_lock();
+	if ((ret = cache_group_next_rcu(devs, iter)))
+		percpu_ref_get(&ret->ref);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+#define group_for_each_cache(ca, devs, iter)				\
+	for ((iter) = 0;						\
+	     (ca = cache_group_next(devs, &(iter)));			\
+	     percpu_ref_put(&ca->ref), (iter)++)
+
 #define __open_bucket_next_online_device(_c, _ob, _ptr, _ca)	\
 ({								\
 	(_ca) = NULL;						\
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h
index 065b9c02f185..0bfafbf8c91f 100644
--- a/drivers/md/bcache/alloc_types.h
+++ b/drivers/md/bcache/alloc_types.h
@@ -49,6 +49,16 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
 	return id <= RESERVE_METADATA_LAST;
 }
 
+struct cache_group {
+	spinlock_t		lock;
+	unsigned		nr_devices;
+	unsigned		cur_device;
+	struct {
+		u64		weight;
+		struct cache __rcu *dev;
+	} d[MAX_CACHES_PER_SET];
+};
+
 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
 #define OPEN_BUCKETS_COUNT	256
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 318616955570..aee5451a762d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -321,12 +321,6 @@ struct gc_pos {
 	unsigned		level;
 };
 
-struct cache_group {
-	seqcount_t		lock;
-	unsigned		nr_devices;
-	struct cache __rcu	*devices[MAX_CACHES_PER_SET];
-};
-
 struct cache_member_cpu {
 	u64			nbuckets;	/* device size */
 	u16			first_bucket;	/* index of first bucket used */
diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h
index da077a3fe5a6..42fcd620cefe 100644
--- a/drivers/md/bcache/buckets.h
+++ b/drivers/md/bcache/buckets.h
@@ -171,23 +171,16 @@ static inline size_t buckets_available_cache(struct cache *ca)
 }
 
 static inline size_t __buckets_free_cache(struct cache *ca,
-					  struct bucket_stats_cache stats,
-					  enum alloc_reserve reserve)
+					  struct bucket_stats_cache stats)
 {
-	size_t free = __buckets_available_cache(ca, stats) +
-		fifo_used(&ca->free[reserve]) +
+	return __buckets_available_cache(ca, stats) +
+		fifo_used(&ca->free[RESERVE_NONE]) +
 		fifo_used(&ca->free_inc);
-
-	if (reserve == RESERVE_NONE)
-		free = max_t(ssize_t, 0, free - ca->reserve_buckets_count);
-
-	return free;
 }
 
-static inline size_t buckets_free_cache(struct cache *ca,
-					enum alloc_reserve reserve)
+static inline size_t buckets_free_cache(struct cache *ca)
 {
-	return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca), reserve);
+	return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca));
 }
 
 /* Cache set stats: */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index fd93cbfe2a19..bebc4e99839c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -5,6 +5,7 @@
  */
 
 #include "bcache.h"
+#include "alloc.h"
 #include "bkey_methods.h"
 #include "buckets.h"
 #include "btree_gc.h"
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f2fb9567b4b5..1b0663297fab 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1904,9 +1904,9 @@ static const char *cache_alloc(struct bcache_superblock *sb,
 
 	kobject_init(&ca->kobj, &bch_cache_ktype);
 
-	seqcount_init(&ca->self.lock);
+	spin_lock_init(&ca->self.lock);
 	ca->self.nr_devices = 1;
-	rcu_assign_pointer(ca->self.devices[0], ca);
+	rcu_assign_pointer(ca->self.d[0].dev, ca);
 	ca->sb.nr_this_dev = sb->sb->nr_this_dev;
 
 	INIT_WORK(&ca->free_work, bch_cache_free_work);
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 517047938aa2..635e1a6f5cf3 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -59,41 +59,6 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
 	     (ca = bch_get_next_cache(c, &(iter)));			\
 	     percpu_ref_put(&ca->ref), (iter)++)
 
-static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
-						 unsigned *iter)
-{
-	struct cache *ret = NULL;
-
-	while (*iter < devs->nr_devices &&
-	       !(ret = rcu_dereference(devs->devices[*iter])))
-		(*iter)++;
-
-	return ret;
-}
-
-#define group_for_each_cache_rcu(ca, devs, iter)			\
-	for ((iter) = 0;						\
-	     ((ca) = cache_group_next_rcu((devs), &(iter)));		\
-	     (iter)++)
-
-static inline struct cache *cache_group_next(struct cache_group *devs,
-					     unsigned *iter)
-{
-	struct cache *ret;
-
-	rcu_read_lock();
-	if ((ret = cache_group_next_rcu(devs, iter)))
-		percpu_ref_get(&ret->ref);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-#define group_for_each_cache(ca, devs, iter)				\
-	for ((iter) = 0;						\
-	     (ca = cache_group_next(devs, &(iter)));			\
-	     percpu_ref_put(&ca->ref), (iter)++)
-
 void bch_check_mark_super_slowpath(struct cache_set *, const struct bkey_i *,
 				   bool);
 
@@ -151,7 +116,7 @@ static inline bool bch_cache_may_remove(struct cache *ca)
 	 */
 
 	return tier->nr_devices != 1 ||
-		rcu_access_pointer(tier->devices[0]) != ca;
+		rcu_access_pointer(tier->d[0].dev) != ca;
 }
 
 void free_super(struct bcache_superblock *);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 52ebb718e376..60436b0ed3a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -1205,7 +1205,7 @@ SHOW(bch_cache)
 	sysfs_print(meta_buckets, stats.buckets_meta);
 	sysfs_print(alloc_buckets, stats.buckets_alloc);
 	sysfs_print(available_buckets, buckets_available_cache(ca));
-	sysfs_print(free_buckets, buckets_free_cache(ca, RESERVE_NONE));
+	sysfs_print(free_buckets, buckets_free_cache(ca));
 
 	sysfs_print(has_data, ca->mi.has_data);
 	sysfs_print(has_metadata, ca->mi.has_metadata);
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 524a304c367f..02ceb586239d 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -1,5 +1,6 @@
 
 #include "bcache.h"
+#include "alloc.h"
 #include "btree_iter.h"
 #include "buckets.h"
 #include "clock.h"
@@ -79,7 +80,7 @@ static void refill_next(struct cache_set *c, struct tiering_refill *refill)
 	while (1) {
 		while (refill->cache_iter < tier->nr_devices) {
 			refill->ca = rcu_dereference(
-					tier->devices[refill->cache_iter]);
+					tier->d[refill->cache_iter].dev);
 			if (refill->ca != NULL) {
 				percpu_ref_get(&refill->ca->ref);
 				goto out;
@@ -274,7 +275,7 @@ static int tiering_next_cache(struct cache_set *c,
 			continue;
 		}
 
-		ca = rcu_dereference(tier->devices[*cache_iter]);
+		ca = rcu_dereference(tier->d[*cache_iter].dev);
 		if (ca == NULL ||
 		    ca->mi.state != CACHE_ACTIVE ||
 		    ca->tiering_queue.stopped) {
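
Below is a small userspace sketch of the weighted device selection scheme that the reworked bch_next_cache() above implements: weights are recomputed from each device's free-bucket count, scaled into a fixed [min_weight, max_weight] range, and devices are then walked round-robin from cur_device, accepting one when a random value falls at or under its weight. The struct dev / pick_device names and the xorshift PRNG are illustrative stand-ins, not kernel code; the real function works on struct cache_group, uses div64_u64() and get_random_int(), and also skips devices already set in cache_used.

/*
 * Userspace sketch of the weighted selection in the new bch_next_cache().
 * Names here are hypothetical; only the weighting/acceptance logic mirrors
 * the patch above.
 */
#include <stdint.h>
#include <stdio.h>

struct dev {
	uint64_t free_buckets;	/* analogue of buckets_free_cache(ca) */
	uint64_t weight;	/* recomputed before each pick */
	unsigned long picks;	/* demo statistics */
};

/* tiny xorshift32 PRNG standing in for get_random_int() */
static uint32_t rng_state = 2463534242u;

static uint32_t prng32(void)
{
	uint32_t x = rng_state;

	x ^= x << 13;
	x ^= x >> 17;
	x ^= x << 5;
	return rng_state = x;
}

static unsigned pick_device(struct dev *d, unsigned nr, unsigned *cur)
{
	const uint64_t min_weight = UINT32_MAX >> 4;
	const uint64_t max_weight = UINT32_MAX;
	uint64_t available = 0;
	unsigned i;

	/* recalculate weightings from free space */
	for (i = 0; i < nr; i++) {
		d[i].weight = d[i].free_buckets;
		available += d[i].weight;
	}

	/* scale each weight into [min_weight, max_weight] */
	for (i = 0; i < nr; i++) {
		d[i].weight = min_weight +
			d[i].weight * nr * (max_weight - min_weight) / available;
		if (d[i].weight > max_weight)
			d[i].weight = max_weight;
	}

	/*
	 * Walk round-robin from where the last pick left off; accept a device
	 * when a random 32-bit value is <= its weight, so devices with more
	 * free buckets are chosen proportionally more often.
	 */
	i = *cur;
	for (;;) {
		unsigned this = i;
		uint64_t w = d[i].weight;

		i = (i + 1) % nr;
		if (prng32() <= w) {
			*cur = i;
			return this;
		}
	}
}

int main(void)
{
	struct dev d[3] = {
		{ .free_buckets = 1000 },
		{ .free_buckets = 4000 },
		{ .free_buckets = 16000 },
	};
	unsigned cur = 0, i;

	for (i = 0; i < 100000; i++)
		d[pick_device(d, 3, &cur)].picks++;

	for (i = 0; i < 3; i++)
		printf("dev %u: free=%llu picks=%lu\n", i,
		       (unsigned long long)d[i].free_buckets, d[i].picks);
	return 0;
}

Because every weight is clamped to at least min_weight, the acceptance loop always terminates; the overall effect is that allocation pressure shifts toward devices with more free buckets, replacing the seqlock-protected bch_rand_range() scan that the patch removes.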