diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-02-20 04:27:56 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-08-28 19:16:16 -0800 |
commit | aae4a1ff28578bbb1346deedfffd5152b5b2a49f (patch) | |
tree | 3a0bcc0e78f65dff676e18952f2cbc5a530c89a2 | |
parent | 8257a6962f0ce690672fc228d220c82126a54f35 (diff) |
rework allocator round robining (branch: bcache-dev-wip)
-rw-r--r-- | fs/bcachefs/alloc.c | 181 | ||||
-rw-r--r-- | fs/bcachefs/bcache.h | 11 | ||||
-rw-r--r-- | fs/bcachefs/journal.c | 12 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/super.h | 45 | ||||
-rw-r--r-- | fs/bcachefs/util.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/util.h | 2 |
7 files changed, 116 insertions, 143 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 3485019c535a..67de0d05535f 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -80,10 +80,10 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache * { unsigned i; - write_seqcount_begin(&grp->lock); + mutex_lock(&grp->lock); for (i = 0; i < grp->nr_devices; i++) - if (rcu_access_pointer(grp->devices[i]) == ca) { + if (grp->devices[i].dev == ca) { grp->nr_devices--; memmove(&grp->devices[i], &grp->devices[i + 1], @@ -91,16 +91,40 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache * break; } - write_seqcount_end(&grp->lock); + grp->next_alloc %= grp->nr_devices; + + mutex_unlock(&grp->lock); } -static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca) +__must_check +static int bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca) { - write_seqcount_begin(&grp->lock); - BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET); + mutex_lock(&grp->lock); + + if (grp->nr_devices == grp->nr_devices_max) { + struct cache_group_entry *new_devices; + size_t new_max = grp->nr_devices_max + ? 
grp->nr_devices_max * 2 + : 8; + + new_devices = krealloc(grp->devices, + new_max * sizeof(grp->devices[0]), + GFP_KERNEL); + if (!new_devices) { + mutex_unlock(&grp->lock); + return -ENOMEM; + } + + grp->devices = new_devices; + } - rcu_assign_pointer(grp->devices[grp->nr_devices++], ca); - write_seqcount_end(&grp->lock); + grp->devices[grp->nr_devices++] = (struct cache_group_entry) { + .dev = ca, + }; + + mutex_unlock(&grp->lock); + + return 0; } /* Ratelimiting/PD controllers */ @@ -124,9 +148,9 @@ static void pd_controllers_update(struct work_struct *work) memset(tier_free, 0, sizeof(tier_free)); memset(tier_dirty, 0, sizeof(tier_dirty)); - rcu_read_lock(); - for (i = CACHE_TIERS - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = CACHE_TIERS - 1; i >= 0; --i) { + mutex_lock(&c->cache_tiers[i].lock); + group_for_each_cache(ca, &c->cache_tiers[i], iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; @@ -159,7 +183,8 @@ static void pd_controllers_update(struct work_struct *work) tier_free[i] += free; tier_dirty[i] += stats.buckets_dirty << bucket_bits; } - rcu_read_unlock(); + mutex_unlock(&c->cache_tiers[i].lock); + } if (tier_size[1]) { u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); @@ -917,64 +942,11 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g) enum bucket_alloc_ret { ALLOC_SUCCESS, - CACHE_SET_FULL, /* -ENOSPC */ + CACHE_SET_FULL, /* No devices to allocate from */ BUCKETS_NOT_AVAILABLE, /* Device full */ FREELIST_EMPTY, /* Allocator thread not keeping up */ }; -static struct cache *bch_next_cache(struct cache_set *c, - enum alloc_reserve reserve, - struct cache_group *devs, - long *cache_used) -{ - struct cache *ca; - size_t bucket_count = 0, rand; - unsigned i; - - /* - * first ptr allocation will always go to the specified tier, - * 2nd and greater can go to any. 
If one tier is significantly larger - * it is likely to go that tier. - */ - - group_for_each_cache_rcu(ca, devs, i) { - if (test_bit(ca->sb.nr_this_dev, cache_used)) - continue; - - bucket_count += buckets_free_cache(ca, reserve); - } - - if (!bucket_count) - return ERR_PTR(-BUCKETS_NOT_AVAILABLE); - - /* - * We create a weighted selection by using the number of free buckets - * in each cache. You can think of this like lining up the caches - * linearly so each as a given range, corresponding to the number of - * free buckets in that cache, and then randomly picking a number - * within that range. - */ - - rand = bch_rand_range(bucket_count); - - group_for_each_cache_rcu(ca, devs, i) { - if (test_bit(ca->sb.nr_this_dev, cache_used)) - continue; - - bucket_count -= buckets_free_cache(ca, reserve); - - if (rand >= bucket_count) - return ca; - } - - /* - * If we fall off the end, it means we raced because of bucket counters - * changing - return NULL so __bch_bucket_alloc_set() knows to retry - */ - - return NULL; -} - /* * XXX: change the alloc_group to explicitly round robin across devices */ @@ -986,48 +958,45 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, long *caches_used) { enum bucket_alloc_ret ret; + struct cache_group_entry *i; + unsigned idx, start_idx; + u64 buckets_free_total = 0, buckets_free_fraction, r; + bool first_loop = true; BUG_ON(nr_replicas > BCH_REPLICAS_MAX); - if (!devs->nr_devices) + mutex_lock(&devs->lock); + + if (!devs->nr_devices) { + mutex_unlock(&devs->lock); return CACHE_SET_FULL; + } - rcu_read_lock(); + for (i = devs->devices; + i < devs->devices + devs->nr_devices; + i++) { + i->buckets_free = buckets_free_cache(i->dev, reserve); + buckets_free_total += i->buckets_free; + } - /* sort by free space/prio of oldest data in caches */ + idx = start_idx = devs->next_alloc; while (ob->nr_ptrs < nr_replicas) { struct cache *ca; - unsigned seq; - size_t r; - /* first ptr goes to the specified tier, the rest to 
any */ - do { - struct cache_group *d; + i = &devs->devices[idx]; + ca = i->dev; - seq = read_seqcount_begin(&devs->lock); + if (test_bit(ca->sb.nr_this_dev, caches_used)) + goto next; - d = (!ob->nr_ptrs && devs == &c->cache_all && - c->cache_tiers[0].nr_devices) - ? &c->cache_tiers[0] - : devs; + buckets_free_fraction = (u64) i->buckets_free * + devs->nr_devices; - ca = devs->nr_devices - ? bch_next_cache(c, reserve, d, caches_used) - : ERR_PTR(-CACHE_SET_FULL); - - /* - * If ca == NULL, we raced because of bucket counters - * changing - */ - } while (read_seqcount_retry(&devs->lock, seq) || !ca); - - if (IS_ERR(ca)) { - ret = -PTR_ERR(ca); - goto err; - } - - __set_bit(ca->sb.nr_this_dev, caches_used); + if (first_loop && + (buckets_free_fraction < buckets_free_total && + buckets_free_fraction < bch_rand_range(buckets_free_total))) + goto next; r = bch_bucket_alloc(ca, reserve); if (!r) { @@ -1035,6 +1004,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, goto err; } + __set_bit(ca->sb.nr_this_dev, caches_used); + /* * open_bucket_add_buckets expects new pointers at the head of * the list: @@ -1048,9 +1019,19 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, .offset = bucket_to_sector(ca, r), .dev = ca->sb.nr_this_dev, }; +next: + idx++; + idx %= devs->nr_devices; + + if (idx == start_idx) { + if (!first_loop) + break; + first_loop = false; + } } - rcu_read_unlock(); + mutex_unlock(&devs->lock); + return ALLOC_SUCCESS; err: rcu_read_unlock(); @@ -1785,7 +1766,10 @@ void bch_open_buckets_init(struct cache_set *c) list_add(&c->open_buckets[i].list, &c->open_buckets_free); } - seqcount_init(&c->cache_all.lock); + mutex_init(&c->cache_all.lock); + + for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) + mutex_init(&c->cache_tiers[i].lock); for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { c->write_points[i].throttle = true; @@ -1793,9 +1777,6 @@ void bch_open_buckets_init(struct cache_set *c) 
c->write_points[i].group = &c->cache_tiers[0]; } - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - seqcount_init(&c->cache_tiers[i].lock); - c->promote_write_point.group = &c->cache_tiers[0]; c->promote_write_point.reserve = RESERVE_NONE; diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h index a6bbd38f4316..37237109299a 100644 --- a/fs/bcachefs/bcache.h +++ b/fs/bcachefs/bcache.h @@ -314,10 +314,17 @@ struct gc_pos { unsigned level; }; +struct cache_group_entry { + struct cache *dev; + u64 buckets_free; +}; + struct cache_group { - seqcount_t lock; + struct mutex lock; unsigned nr_devices; - struct cache __rcu *devices[MAX_CACHES_PER_SET]; + unsigned nr_devices_max; + unsigned next_alloc; + struct cache_group_entry *devices; }; struct cache_member_cpu { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1b49c44f92c4..ad97bb834f8a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1537,6 +1537,7 @@ static void journal_reclaim_work(struct work_struct *work) * Advance last_idx to point to the oldest journal entry containing * btree node updates that have not yet been written out */ + mutex_lock(&c->cache_tiers[0].lock); group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; @@ -1584,6 +1585,7 @@ static void journal_reclaim_work(struct work_struct *work) ja->bucket_seq[bucket_to_flush]); spin_unlock(&j->lock); } + mutex_unlock(&c->cache_tiers[0].lock); if (reclaim_lock_held) mutex_unlock(&j->reclaim_lock); @@ -1634,7 +1636,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) * Determine location of the next journal write: * XXX: sort caches by free journal space */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) { + + /* XXX: use devices from other tiers if insufficient tier 0 devices */ + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; unsigned nr_buckets = 
bch_nr_journal_buckets(ca->disk_sb.sb); @@ -1664,6 +1669,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } + mutex_unlock(&c->cache_tiers[0].lock); rcu_read_unlock(); spin_unlock(&j->lock); @@ -2264,7 +2270,8 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) { + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], iter) { struct journal_device *ja = &ca->journal; ret += scnprintf(buf + ret, PAGE_SIZE - ret, @@ -2276,6 +2283,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } + mutex_unlock(&c->cache_tiers[0].lock); spin_unlock(&j->lock); rcu_read_unlock(); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index beb0587be4ce..1938bec2bbe1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -146,7 +146,8 @@ static int bch_congested_fn(void *data, int bdi_bits) } } else { /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + mutex_lock(&c->cache_tiers[0].lock); + group_for_each_cache(ca, &c->cache_tiers[0], i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -154,6 +155,7 @@ static int bch_congested_fn(void *data, int bdi_bits) break; } } + mutex_unlock(&c->cache_tiers[0].lock); } rcu_read_unlock(); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 24f9b0ead9c8..8ec9fcffabdf 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -59,40 +59,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, (ca = bch_get_next_cache(c, &(iter))); \ percpu_ref_put(&ca->ref), (iter)++) -static inline struct cache *cache_group_next_rcu(struct cache_group *devs, - unsigned *iter) -{ - struct cache 
*ret = NULL; - - while (*iter < devs->nr_devices && - !(ret = rcu_dereference(devs->devices[*iter]))) - (*iter)++; - - return ret; -} - -#define group_for_each_cache_rcu(ca, devs, iter) \ - for ((iter) = 0; \ - ((ca) = cache_group_next_rcu((devs), &(iter))); \ - (iter)++) - -static inline struct cache *cache_group_next(struct cache_group *devs, - unsigned *iter) -{ - struct cache *ret; - - rcu_read_lock(); - if ((ret = cache_group_next_rcu(devs, iter))) - percpu_ref_get(&ret->ref); - rcu_read_unlock(); - - return ret; -} - #define group_for_each_cache(ca, devs, iter) \ - for ((iter) = 0; \ - (ca = cache_group_next(devs, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) + for (({ lockdep_assert_held(&(devs)->lock); (iter) = 0; }); \ + (iter) < (devs)->nr_devices && \ + ((ca) = (devs)->devices[(iter)].dev, true); \ + (iter)++) void bch_check_mark_super_slowpath(struct cache_set *, const struct bkey_i *, bool); @@ -132,6 +103,7 @@ static inline bool bch_cache_may_remove(struct cache *ca) { struct cache_set *c = ca->set; struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + bool ret; /* * Right now, we can't remove the last device from a tier, @@ -150,8 +122,11 @@ static inline bool bch_cache_may_remove(struct cache *ca) * whole cache set RO. 
*/ - return tier->nr_devices != 1 || - rcu_access_pointer(tier->devices[0]) != ca; + mutex_lock(&tier->lock); + ret = tier->nr_devices != 1 || tier->devices[0].dev != ca; + mutex_unlock(&tier->lock); + + return ret; } void free_super(struct bcache_superblock *); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 9a0d89d6223c..05a778c0f0c9 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -388,9 +388,9 @@ void bch_bio_free_pages(struct bio *bio) __free_page(bv->bv_page); } -size_t bch_rand_range(size_t max) +u64 bch_rand_range(u64 max) { - size_t rand; + u64 rand; do { get_random_bytes(&rand, sizeof(rand)); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 97eea8892572..757a575e28bd 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -625,7 +625,7 @@ do { \ _ret; \ }) -size_t bch_rand_range(size_t); +u64 bch_rand_range(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); |