| author    | Kent Overstreet <kent.overstreet@gmail.com> | 2016-12-12 12:53:56 -0900 |
|-----------|---------------------------------------------|---------------------------|
| committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 21:41:26 -0900 |
| commit    | 019162ee32535849042f7482e7cd57c7263864bd (patch) | |
| tree      | 50e0b62048d0ed806328355c9e20429059e577cf | |
| parent    | 6584f775bbc01fd14455272afafcd8c45e5b9801 (diff) | |
bcache: short circuit bch_prio_write() and journal flush when possible
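
The change, in brief: rather than always invalidating buckets on the free_inc refill path (which bumps the bucket generation and therefore forces a prio/gen rewrite and a journal flush), the allocator first scans for buckets that are already completely empty and whose last modification has already reached the journal on disk; those can be pushed straight onto free_inc. Below is a minimal sketch of that reuse test, modelled on bch_find_empty_buckets() in the diff further down. The struct layout and is_available_bucket() here are simplified stand-ins for illustration, not the kernel definitions; only the shape of the check is taken from the patch.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified stand-in for struct bucket_mark; only the fields the reuse
 * check looks at.  The real mark is a packed 64-bit word (see the
 * buckets_types.h hunk below).
 */
struct bucket_mark {
	bool	 owned_by_allocator;
	bool	 is_metadata;
	bool	 wait_on_journal;
	uint16_t cached_sectors;
	uint16_t dirty_sectors;
	uint16_t journal_seq;	/* low bits of the last journal seq that touched it */
};

/* Stand-in for is_available_bucket(); the exact kernel definition is not
 * shown in this diff. */
static bool is_available_bucket(struct bucket_mark m)
{
	return !m.owned_by_allocator && !m.is_metadata && !m.dirty_sectors;
}

/*
 * Can the bucket be reused without invalidating it, i.e. without bumping its
 * gen and therefore without a prio write or journal flush?  It must hold no
 * data at all, and any journal entry that dirtied it must already be on disk.
 */
static bool bucket_reusable(struct bucket_mark m, uint16_t last_seq_ondisk)
{
	return is_available_bucket(m) &&
	       !m.cached_sectors &&
	       (!m.wait_on_journal ||
		(int16_t) last_seq_ondisk - (int16_t) m.journal_seq >= 0);
}

int main(void)
{
	struct bucket_mark m = { .wait_on_journal = true, .journal_seq = 12 };

	/* Reusable once the journal has flushed up to (or past) seq 12. */
	printf("%d %d\n", bucket_reusable(m, 10), bucket_reusable(m, 12));
	return 0;
}
```

In the patched allocator thread this scan runs before the invalidate path; if it fills more than half of free_inc, the thread drops gc_lock and skips invalidation (and with it the prio write and journal flush) entirely.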
-rw-r--r-- | drivers/md/bcache/alloc.c         | 130 |
-rw-r--r-- | drivers/md/bcache/alloc.h         |   1 |
-rw-r--r-- | drivers/md/bcache/bcache.h        |   6 |
-rw-r--r-- | drivers/md/bcache/btree_gc.c      |  22 |
-rw-r--r-- | drivers/md/bcache/btree_update.c  |   8 |
-rw-r--r-- | drivers/md/bcache/buckets.c       | 196 |
-rw-r--r-- | drivers/md/bcache/buckets.h       |  38 |
-rw-r--r-- | drivers/md/bcache/buckets_types.h |  28 |
-rw-r--r-- | drivers/md/bcache/extents.c       | 120 |
-rw-r--r-- | drivers/md/bcache/journal.c       |   5 |
-rw-r--r-- | drivers/md/bcache/movinggc.c      |  34 |
-rw-r--r-- | drivers/md/bcache/super.c         |  14 |
12 files changed, 318 insertions, 284 deletions
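
Throughout the patch, per-bucket state that used to live in separate fields (bucket_gens[], copygc_gen, the old 32-bit mark) is folded into a single 64-bit bucket_mark, and updates go through the new bucket_cmpxchg() macro in buckets.h so that gen bumps, sector counts and the journal_seq/wait_on_journal bits change atomically as one word. Here is a rough userspace model of that pattern, using C11 atomics in place of the kernel's cmpxchg() and an illustrative field layout (the real struct uses bitfields and different widths); it is a sketch of the idiom, not the kernel code.

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified stand-in for struct bucket_mark: everything packed into one
 * 64-bit word so the whole mark can be read and updated atomically.
 */
struct bucket_mark {
	union {
		uint64_t counter;
		struct {
			uint8_t  gen;
			uint8_t  owned_by_allocator;
			uint16_t cached_sectors;
			uint16_t dirty_sectors;
			uint16_t journal_seq;
		};
	};
};

struct bucket {
	_Atomic uint64_t packed;	/* plays the role of bucket->_mark.counter */
};

/*
 * Retry loop in the same shape as the patch's bucket_cmpxchg() macro: read
 * the old packed mark, apply @expr to a copy, and publish it only if nothing
 * changed underneath us; evaluates to the old mark.
 */
#define bucket_cmpxchg(b, new, expr)					\
({									\
	uint64_t _v = atomic_load(&(b)->packed);			\
	struct bucket_mark _old;					\
									\
	do {								\
		(new).counter = _old.counter = _v;			\
		expr;							\
	} while (!atomic_compare_exchange_weak(&(b)->packed, &_v,	\
					       (new).counter));		\
	_old;								\
})

int main(void)
{
	struct bucket b = { .packed = 0 };
	struct bucket_mark new = { 0 };

	/* Invalidate the bucket: take ownership, bump the gen and clear the
	 * sector counts in a single atomic step, as bch_invalidate_bucket()
	 * does in the patch. */
	struct bucket_mark old = bucket_cmpxchg(&b, new, ({
		new.owned_by_allocator = 1;
		new.cached_sectors = 0;
		new.dirty_sectors = 0;
		new.gen++;
	}));

	printf("gen %d -> %d\n", old.gen, new.gen);
	return 0;
}
```

The statement-expression form mirrors the kernel macro and needs GCC/Clang; the real cmpxchg() returns the old value rather than a success flag, but the retry structure is the same.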
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ac40a96b381e..0887ed3eccd1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -264,7 +264,7 @@ static int bch_prio_write(struct cache *ca) g = ca->buckets + r; d->read_prio = cpu_to_le16(g->read_prio); d->write_prio = cpu_to_le16(g->write_prio); - d->gen = ca->bucket_gens[r]; + d->gen = ca->buckets[r].mark.gen; } p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); @@ -343,6 +343,7 @@ int bch_prio_read(struct cache *ca) struct cache_set *c = ca->set; struct prio_set *p = ca->disk_buckets; struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket_mark new; unsigned bucket_nr = 0; u64 bucket, expect, got; size_t b; @@ -398,8 +399,8 @@ int bch_prio_read(struct cache *ca) ca->buckets[b].read_prio = le16_to_cpu(d->read_prio); ca->buckets[b].write_prio = le16_to_cpu(d->write_prio); - ca->buckets[b].oldest_gen = d->gen; - ca->bucket_gens[b] = d->gen; + + bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); } return 0; @@ -586,31 +587,18 @@ static bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *g) return can_inc_bucket_gen(ca, g); } -static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g) +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *g) { - lockdep_assert_held(&ca->freelist_lock); - - /* Ordering matters: see bch_mark_data_bucket() */ + spin_lock(&ca->freelist_lock); - /* bucket mark updates imply a write barrier */ - bch_mark_alloc_bucket(ca, g); + bch_invalidate_bucket(ca, g); g->read_prio = ca->set->prio_clock[READ].hand; g->write_prio = ca->set->prio_clock[WRITE].hand; - g->copygc_gen = 0; verify_not_on_freelist(ca, g - ca->buckets); -} - -static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *g) -{ - spin_lock(&ca->freelist_lock); - - /* this is what makes ptrs to the bucket invalid */ - ca->bucket_gens[g - ca->buckets]++; - - __bch_invalidate_one_bucket(ca, g); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + spin_unlock(&ca->freelist_lock); } @@ -779,6 +767,35 @@ static bool bch_allocator_push(struct cache *ca, long bucket) return ret; } +static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca) +{ + u16 last_seq_ondisk = c->journal.last_seq_ondisk; + struct bucket *g; + + for_each_bucket(g, ca) { + struct bucket_mark m = READ_ONCE(g->mark); + + if (is_available_bucket(m) && + !m.cached_sectors && + (!m.wait_on_journal || + ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) { + spin_lock(&ca->freelist_lock); + + bch_mark_alloc_bucket(ca, g, true); + g->read_prio = ca->set->prio_clock[READ].hand; + g->write_prio = ca->set->prio_clock[WRITE].hand; + + verify_not_on_freelist(ca, g - ca->buckets); + BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + + spin_unlock(&ca->freelist_lock); + + if (fifo_full(&ca->free_inc)) + break; + } + } +} + /** * bch_allocator_thread - move buckets from free_inc to reserves * @@ -833,10 +850,21 @@ static int bch_allocator_thread(void *arg) __set_current_state(TASK_RUNNING); } - /* We've run out of free buckets! */ - down_read(&c->gc_lock); + /* + * See if we have buckets we can reuse without invalidating them + * or forcing a journal commit: + */ + bch_find_empty_buckets(c, ca); + + if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { + up_read(&c->gc_lock); + continue; + } + + /* We've run out of free buckets! 
*/ + while (!fifo_full(&ca->free_inc)) { if (wait_buckets_available(ca)) { up_read(&c->gc_lock); @@ -1044,7 +1072,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, ob->nr_ptrs * sizeof(ob->ptr_offset[0])); ob->nr_ptrs++; ob->ptrs[0] = (struct bch_extent_ptr) { - .gen = ca->bucket_gens[bucket], + .gen = ca->buckets[bucket].mark.gen, .offset = bucket_to_sector(ca, bucket), .dev = ca->sb.nr_this_dev, }; @@ -1155,7 +1183,7 @@ static void __bch_open_bucket_put(struct cache_set *c, struct open_bucket *ob) rcu_read_lock(); open_bucket_for_each_online_device(c, ob, ptr, ca) - bch_unmark_open_bucket(ca, PTR_BUCKET(ca, ptr)); + bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); rcu_read_unlock(); ob->nr_ptrs = 0; @@ -1746,8 +1774,6 @@ int bch_cache_allocator_start(struct cache *ca) /* * allocator thread already started? - * (run_cache_set() starts allocator separately from normal rw path, via - * bch_cache_allocator_start_once()) */ if (ca->alloc_thread) return 0; @@ -1773,58 +1799,6 @@ int bch_cache_allocator_start(struct cache *ca) return 0; } -/* - * bch_cache_allocator_start - fill freelists directly with completely unused - * buckets - * - * The allocator thread needs freed buckets to rewrite the prios and gens, and - * it needs to rewrite prios and gens in order to free buckets. - * - * Don't increment gens. We are only re-using completely free buckets here, so - * there are no existing pointers into them. - * - * Also, we can't increment gens until we re-write prios and gens, but we - * can't do that until we can write a journal entry. - * - * If the journal is completely full, we cannot write a journal entry until we - * reclaim a journal bucket, and we cannot do that until we possibly allocate - * some buckets for btree nodes. - * - * So dig ourselves out of that hole here. - * - * This is only safe for buckets that have no live data in them, which there - * should always be some of when this function is called, since the last time - * we shut down there should have been unused buckets stranded on freelists. 
- */ -const char *bch_cache_allocator_start_once(struct cache *ca) -{ - struct bucket *g; - - spin_lock(&ca->freelist_lock); - for_each_bucket(g, ca) { - if (fifo_full(&ca->free[RESERVE_NONE])) - break; - - if (bch_can_invalidate_bucket(ca, g) && - !g->mark.cached_sectors) { - __bch_invalidate_one_bucket(ca, g); - BUG_ON(!__bch_allocator_push(ca, g - ca->buckets)); - } - } - spin_unlock(&ca->freelist_lock); - - if (cache_set_init_fault("alloc_start")) - return "dynamic fault"; - - if (!fifo_full(&ca->free[RESERVE_PRIO])) - return "couldn't find enough available buckets to write prios"; - - if (bch_cache_allocator_start(ca)) - return "error starting allocator thread"; - - return NULL; -} - void bch_open_buckets_init(struct cache_set *c) { unsigned i; diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index c39f43e02e80..46dbad44567a 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -91,7 +91,6 @@ static inline struct cache *cache_group_next(struct cache_group *devs, void bch_cache_allocator_stop(struct cache *); int bch_cache_allocator_start(struct cache *); -const char *bch_cache_allocator_start_once(struct cache *); void bch_open_buckets_init(struct cache_set *); #endif /* _BCACHE_ALLOC_H */ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e0c8716172e0..3540e05e40ef 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -407,7 +407,9 @@ struct cache { size_t fifo_last_bucket; /* Allocation stuff: */ - u8 *bucket_gens; + + /* most out of date gen in the btree */ + u8 *oldest_gens; struct bucket *buckets; unsigned short bucket_bits; /* ilog2(bucket_size) */ @@ -763,6 +765,8 @@ struct cache_set { struct journal journal; + unsigned bucket_journal_seq; + /* CACHING OTHER BLOCK DEVICES */ mempool_t search; struct radix_tree_root devices; diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index e34445da10f5..a00785189ffe 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -102,10 +102,10 @@ u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k) rcu_read_lock(); extent_for_each_online_device(c, e, ptr, ca) { - struct bucket *g = PTR_BUCKET(ca, ptr); + size_t b = PTR_BUCKET_NR(ca, ptr); - if (__gen_after(g->oldest_gen, ptr->gen)) - g->oldest_gen = ptr->gen; + if (__gen_after(ca->oldest_gens[b], ptr->gen)) + ca->oldest_gens[b] = ptr->gen; max_stale = max(max_stale, ptr_stale(ca, ptr)); } @@ -237,11 +237,11 @@ static void bch_mark_allocator_buckets(struct cache_set *c) spin_lock(&ca->freelist_lock); fifo_for_each_entry(i, &ca->free_inc, iter) - bch_mark_alloc_bucket(ca, &ca->buckets[i]); + bch_mark_alloc_bucket(ca, &ca->buckets[i], true); for (j = 0; j < RESERVE_NR; j++) fifo_for_each_entry(i, &ca->free[j], iter) - bch_mark_alloc_bucket(ca, &ca->buckets[i]); + bch_mark_alloc_bucket(ca, &ca->buckets[i], true); spin_unlock(&ca->freelist_lock); } @@ -254,7 +254,7 @@ static void bch_mark_allocator_buckets(struct cache_set *c) mutex_lock(&ob->lock); rcu_read_lock(); open_bucket_for_each_online_device(c, ob, ptr, ca) - bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr)); + bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true); rcu_read_unlock(); mutex_unlock(&ob->lock); } @@ -317,6 +317,7 @@ void bch_gc(struct cache_set *c) { struct cache *ca; struct bucket *g; + struct bucket_mark new; u64 start_time = local_clock(); unsigned i; int cpu; @@ -385,8 +386,13 @@ void bch_gc(struct cache_set *c) /* Clear bucket marks: */ for_each_cache(ca, c, i) 
for_each_bucket(g, ca) { - g->oldest_gen = ca->bucket_gens[g - ca->buckets]; - atomic_set((atomic_t *) &g->mark.counter, 0); + bucket_cmpxchg(g, new, ({ + new.owned_by_allocator = 0; + new.is_metadata = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + })); + ca->oldest_gens[g - ca->buckets] = new.gen; } /* Walk allocator's references: */ diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c index 4feaf02d8409..8b5c5b43d0f8 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -146,7 +146,7 @@ found: -c->sb.btree_node_size, true, b ? gc_pos_btree_node(b) : gc_pos_btree_root(id), - &tmp); + &tmp, 0); /* * Don't apply tmp - pending deletes aren't tracked in * cache_set_stats: @@ -215,7 +215,7 @@ static void bch_btree_node_free_ondisk(struct cache_set *c, bch_mark_key(c, bkey_i_to_s_c(&pending->key), -c->sb.btree_node_size, true, gc_phase(GC_PHASE_PENDING_DELETE), - &stats); + &stats, 0); /* * Don't apply stats - pending deletes aren't tracked in * cache_set_stats: @@ -375,7 +375,7 @@ static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b, bch_mark_key(c, bkey_i_to_s_c(&b->key), c->sb.btree_node_size, true, gc_pos_btree_root(b->btree_id), - &stats); + &stats, 0); if (old) bch_btree_node_free_index(c, NULL, old->btree_id, @@ -636,7 +636,7 @@ static void bch_insert_fixup_btree_ptr(struct btree_iter *iter, if (bkey_extent_is_data(&insert->k)) bch_mark_key(c, bkey_i_to_s_c(insert), c->sb.btree_node_size, true, - gc_pos_btree_node(b), &stats); + gc_pos_btree_node(b), &stats, 0); while ((k = bch_btree_node_iter_peek_all(node_iter, b)) && !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) diff --git a/drivers/md/bcache/buckets.c b/drivers/md/bcache/buckets.c index 254349450f13..90dfa03b0a7a 100644 --- a/drivers/md/bcache/buckets.c +++ b/drivers/md/bcache/buckets.c @@ -101,6 +101,27 @@ static void bch_cache_set_stats_verify(struct cache_set *c) {} #endif +void bch_bucket_seq_cleanup(struct cache_set *c) +{ + u16 last_seq_ondisk = c->journal.last_seq_ondisk; + struct cache *ca; + struct bucket *g; + struct bucket_mark m; + unsigned i; + + for_each_cache(ca, c, i) + for_each_bucket(g, ca) { + bucket_cmpxchg(g, m, ({ + if (!m.wait_on_journal || + ((s16) last_seq_ondisk - + (s16) m.journal_seq < 0)) + break; + + m.wait_on_journal = 0; + })); + } +} + #define bucket_stats_add(_acc, _stats) \ do { \ typeof(_acc) _a = (_acc), _s = (_stats); \ @@ -268,16 +289,23 @@ static void bucket_stats_update(struct cache *ca, bch_wake_allocator(ca); } -static struct bucket_mark bch_bucket_mark_set(struct cache *ca, - struct bucket *g, struct bucket_mark new, - bool may_make_unavailable) +void bch_invalidate_bucket(struct cache *ca, struct bucket *g) { struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old; + struct bucket_mark old, new; - old.counter = xchg(&g->mark.counter, new.counter); + old = bucket_cmpxchg(g, new, ({ + new.owned_by_allocator = 1; + new.is_metadata = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + new.copygc = 0; + new.gen++; + })); - bucket_stats_update(ca, old, new, may_make_unavailable, &stats); + BUG_ON(old.dirty_sectors); + + bucket_stats_update(ca, old, new, true, &stats); /* * Ick: @@ -293,55 +321,51 @@ static struct bucket_mark bch_bucket_mark_set(struct cache *ca, stats.s[S_UNCOMPRESSED][S_CACHED] = 0; BUG_ON(!bch_is_zero(&stats, sizeof(stats))); - return old; + if (!old.owned_by_allocator && old.cached_sectors) + trace_bcache_invalidate(ca, g - ca->buckets, + 
old.cached_sectors); } -#define bucket_cmpxchg(g, old, new, \ - may_make_unavailable, \ - cache_set_stats, expr) \ -do { \ - u32 _v = READ_ONCE((g)->mark.counter); \ - \ - do { \ - new.counter = old.counter = _v; \ - expr; \ - } while ((_v = cmpxchg(&(g)->mark.counter, \ - old.counter, \ - new.counter)) != old.counter); \ - bucket_stats_update(ca, old, new, \ - may_make_unavailable, \ - cache_set_stats); \ -} while (0) - void bch_mark_free_bucket(struct cache *ca, struct bucket *g) { - bch_bucket_mark_set(ca, g, - (struct bucket_mark) { .counter = 0 }, - false); + struct bucket_stats_cache_set stats = { 0 }; + struct bucket_mark old, new; + + old = bucket_cmpxchg(g, new, ({ + new.owned_by_allocator = 0; + new.is_metadata = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + })); + + bucket_stats_update(ca, old, new, false, &stats); } -void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g) +void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g, + bool owned_by_allocator) { - struct bucket_mark old = bch_bucket_mark_set(ca, g, - (struct bucket_mark) { .owned_by_allocator = 1 }, - true); + struct bucket_stats_cache_set stats = { 0 }; + struct bucket_mark old, new; - BUG_ON(old.dirty_sectors); + old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator); - if (!old.owned_by_allocator && old.cached_sectors) - trace_bcache_invalidate(ca, g - ca->buckets, - old.cached_sectors); + bucket_stats_update(ca, old, new, true, &stats); } void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g, bool may_make_unavailable) { - struct bucket_mark old = bch_bucket_mark_set(ca, g, - (struct bucket_mark) { .is_metadata = 1 }, - may_make_unavailable); + struct bucket_stats_cache_set stats = { 0 }; + struct bucket_mark old, new; + + old = bucket_cmpxchg(g, new, ({ + new.is_metadata = 1; + })); BUG_ON(old.cached_sectors); BUG_ON(old.dirty_sectors); + + bucket_stats_update(ca, old, new, may_make_unavailable, &stats); } #define saturated_add(ca, dst, src, max) \ @@ -398,12 +422,12 @@ static void bch_mark_pointer(struct cache_set *c, s64 sectors, enum s_alloc type, bool may_make_unavailable, struct bucket_stats_cache_set *stats, - bool is_gc, struct gc_pos gc_pos) + bool gc_will_visit, u64 journal_seq) { struct bucket_mark old, new; unsigned saturated; struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); - u32 v = READ_ONCE(g->mark.counter); + u64 v = READ_ONCE(g->_mark.counter); unsigned old_sectors, new_sectors; int disk_sectors, compressed_sectors; @@ -420,36 +444,28 @@ static void bch_mark_pointer(struct cache_set *c, compressed_sectors = -__compressed_sectors(crc, old_sectors) + __compressed_sectors(crc, new_sectors); + if (gc_will_visit) { + if (journal_seq) + bucket_cmpxchg(g, new, new.journal_seq = journal_seq); + + goto out; + } + do { new.counter = old.counter = v; saturated = 0; - /* - * cmpxchg() only implies a full barrier on success, not - * failure, so we need a read barrier on all iterations - - * between reading the mark and checking pointer validity/gc - * status - */ - smp_rmb(); /* * Check this after reading bucket mark to guard against * the allocator invalidating a bucket after we've already * checked the gen */ - if (ptr_stale(ca, ptr)) { + if (gen_after(old.gen, ptr->gen)) { EBUG_ON(type != S_CACHED && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); return; } - /* - * Check this after reading bucket mark to guard against - * GC starting between when we check gc_cur_key and when - * the GC zeroes out marks - */ - if (!is_gc && gc_will_visit(c, 
gc_pos)) - goto out; - EBUG_ON(type != S_CACHED && !may_make_unavailable && is_available_bucket(old) && @@ -472,11 +488,17 @@ static void bch_mark_pointer(struct cache_set *c, GC_MAX_SECTORS_USED); if (!new.dirty_sectors && - !new.cached_sectors) + !new.cached_sectors) { new.is_metadata = false; - else + + if (journal_seq) { + new.wait_on_journal = true; + new.journal_seq = journal_seq; + } + } else { new.is_metadata = (type == S_META); - } while ((v = cmpxchg(&g->mark.counter, + } + } while ((v = cmpxchg(&g->_mark.counter, old.counter, new.counter)) != old.counter); @@ -500,7 +522,7 @@ static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e, s64 sectors, bool metadata, bool may_make_unavailable, struct bucket_stats_cache_set *stats, - bool is_gc, struct gc_pos gc_pos) + bool gc_will_visit, u64 journal_seq) { const struct bch_extent_ptr *ptr; const union bch_extent_crc *crc; @@ -519,7 +541,7 @@ static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e, bch_mark_pointer(c, e, ca, crc, ptr, sectors, dirty ? type : S_CACHED, may_make_unavailable, - stats, is_gc, gc_pos); + stats, gc_will_visit, journal_seq); } rcu_read_unlock(); } @@ -528,13 +550,14 @@ static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k, s64 sectors, bool metadata, bool may_make_unavailable, struct bucket_stats_cache_set *stats, - bool is_gc, struct gc_pos gc_pos) + bool gc_will_visit, u64 journal_seq) { switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, - may_make_unavailable, stats, is_gc, gc_pos); + may_make_unavailable, stats, + gc_will_visit, journal_seq); break; case BCH_RESERVATION: stats->persistent_reserved += sectors; @@ -546,7 +569,7 @@ void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k, s64 sectors, bool metadata, struct bucket_stats_cache_set *stats) { - __bch_mark_key(c, k, sectors, metadata, true, stats, true, GC_POS_MIN); + __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0); } void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k, @@ -563,28 +586,43 @@ void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k, void bch_mark_key(struct cache_set *c, struct bkey_s_c k, s64 sectors, bool metadata, struct gc_pos gc_pos, - struct bucket_stats_cache_set *stats) + struct bucket_stats_cache_set *stats, u64 journal_seq) { + /* + * synchronization w.r.t. GC: + * + * Normally, bucket sector counts/marks are updated on the fly, as + * references are added/removed from the btree, the lists of buckets the + * allocator owns, other metadata buckets, etc. + * + * When GC is in progress and going to mark this reference, we do _not_ + * mark this reference here, to avoid double counting - GC will count it + * when it gets to it. + * + * To know whether we should mark a given reference (GC either isn't + * running, or has already marked references at this position) we + * construct a total order for everything GC walks. Then, we can simply + * compare the position of the reference we're marking - @gc_pos - with + * GC's current position. If GC is going to mark this reference, GC's + * current position will be less than @gc_pos; if GC's current position + * is greater than @gc_pos GC has either already walked this position, + * or isn't running. 
+ * + * To avoid racing with GC's position changing, we have to deal with + * - GC's position being set to GC_POS_MIN when GC starts: + * bucket_stats_lock guards against this + * - GC's position overtaking @gc_pos: we guard against this with + * whatever lock protects the data structure the reference lives in + * (e.g. the btree node lock, or the relevant allocator lock). + */ lg_local_lock(&c->bucket_stats_lock); - __bch_mark_key(c, k, sectors, metadata, false, stats, false, gc_pos); + __bch_mark_key(c, k, sectors, metadata, false, stats, + gc_will_visit(c, gc_pos), journal_seq); bch_cache_set_stats_verify(c); lg_local_unlock(&c->bucket_stats_lock); } -void bch_unmark_open_bucket(struct cache *ca, struct bucket *g) -{ - struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old, new; - - bucket_cmpxchg(g, old, new, false, NULL, ({ - new.owned_by_allocator = 0; - })); - - /* owned_by_allocator buckets aren't tracked in cache_set_stats: */ - BUG_ON(!bch_is_zero(&stats, sizeof(stats))); -} - static u64 __recalc_sectors_available(struct cache_set *c) { return c->capacity - cache_set_sectors_used(c); diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h index 277085e152eb..35100eba351e 100644 --- a/drivers/md/bcache/buckets.h +++ b/drivers/md/bcache/buckets.h @@ -10,10 +10,24 @@ #include "buckets_types.h" #include "super.h" -#define for_each_bucket(b, ca) \ - for (b = (ca)->buckets + (ca)->mi.first_bucket; \ +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->mi.first_bucket; \ b < (ca)->buckets + (ca)->mi.nbuckets; b++) +#define bucket_cmpxchg(g, new, expr) \ +({ \ + u64 _v = READ_ONCE((g)->_mark.counter); \ + struct bucket_mark _old; \ + \ + do { \ + (new).counter = _old.counter = _v; \ + expr; \ + } while ((_v = cmpxchg(&(g)->_mark.counter, \ + _old.counter, \ + (new).counter)) != _old.counter);\ + _old; \ +}) + /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
@@ -22,7 +36,7 @@ static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g) { unsigned long r = g - ca->buckets; - return ca->bucket_gens[r] - ca->buckets[r].oldest_gen; + return g->mark.gen - ca->oldest_gens[r]; } static inline struct cache *PTR_CACHE(const struct cache_set *c, @@ -63,13 +77,7 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c, return bucket; } -static inline u8 PTR_BUCKET_GEN(const struct cache *ca, - const struct bch_extent_ptr *ptr) -{ - return ca->bucket_gens[PTR_BUCKET_NR(ca, ptr)]; -} - -static inline struct bucket *PTR_BUCKET(struct cache *ca, +static inline struct bucket *PTR_BUCKET(const struct cache *ca, const struct bch_extent_ptr *ptr) { return ca->buckets + PTR_BUCKET_NR(ca, ptr); @@ -100,7 +108,7 @@ static inline u8 gen_after(u8 a, u8 b) static inline u8 ptr_stale(const struct cache *ca, const struct bch_extent_ptr *ptr) { - return gen_after(PTR_BUCKET_GEN(ca, ptr), ptr->gen); + return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); } /* bucket heaps */ @@ -231,16 +239,18 @@ static inline bool is_available_bucket(struct bucket_mark mark) !mark.dirty_sectors); } +void bch_bucket_seq_cleanup(struct cache_set *); + +void bch_invalidate_bucket(struct cache *, struct bucket *); void bch_mark_free_bucket(struct cache *, struct bucket *); -void bch_mark_alloc_bucket(struct cache *, struct bucket *); +void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool); void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool); -void bch_unmark_open_bucket(struct cache *, struct bucket *); void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, struct bucket_stats_cache_set *); void bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool); void bch_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, - struct gc_pos, struct bucket_stats_cache_set *); + struct gc_pos, struct bucket_stats_cache_set *, u64); void bch_recalc_sectors_available(struct cache_set *); diff --git a/drivers/md/bcache/buckets_types.h b/drivers/md/bcache/buckets_types.h index 7712ff2b6d1a..90bb09c6db35 100644 --- a/drivers/md/bcache/buckets_types.h +++ b/drivers/md/bcache/buckets_types.h @@ -4,14 +4,27 @@ struct bucket_mark { union { struct { - u32 counter; + u64 counter; }; struct { + u8 gen; + + /* generation copygc is going to move this bucket into */ + unsigned copygc:1; + unsigned wait_on_journal:1; + unsigned owned_by_allocator:1; - unsigned cached_sectors:15; unsigned is_metadata:1; - unsigned dirty_sectors:15; + + u16 cached_sectors; + u16 dirty_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: + */ + u16 journal_seq; }; }; }; @@ -24,12 +37,11 @@ struct bucket { }; u16 prio[2]; }; - struct bucket_mark mark; - /* Most out of date gen in the btree */ - u8 oldest_gen; - /* generation copygc is going to move this bucket into */ - u8 copygc_gen; + union { + struct bucket_mark _mark; + const struct bucket_mark mark; + }; }; struct bucket_stats_cache { diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 288603bb5e3c..a0d8fa425930 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -511,8 +511,9 @@ err: cache_set_bug(c, "%s btree pointer %s: bucket %zi prio %i " "gen %i last_gc %i mark %08x", err, buf, PTR_BUCKET_NR(ca, ptr), - g->read_prio, PTR_BUCKET_GEN(ca, ptr), - g->oldest_gen, g->mark.counter); + g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, + ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], + (unsigned) g->mark.counter); 
rcu_read_unlock(); } @@ -862,61 +863,65 @@ struct btree_nr_keys bch_extent_sort_fix_overlapping(struct cache_set *c, return nr; } -static void bch_add_sectors(struct btree_iter *iter, struct bkey_s_c k, - u64 offset, s64 sectors, - struct bucket_stats_cache_set *stats) +struct extent_insert_state { + struct btree_insert *trans; + struct btree_insert_entry *insert; + struct bpos committed; + struct bucket_stats_cache_set stats; + + /* for deleting: */ + struct bkey_i whiteout; + bool do_journal; + bool deleting; +}; + +static void bch_add_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) { - struct cache_set *c = iter->c; - struct btree *b = iter->nodes[0]; + struct cache_set *c = s->trans->c; + struct btree *b = s->insert->iter->nodes[0]; - EBUG_ON(iter->level); EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); if (!sectors) return; - bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b), stats); + bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b), + &s->stats, s->trans->journal_res.seq); if (bkey_extent_is_data(k.k) && !bkey_extent_is_cached(k.k)) bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors); } -static void bch_subtract_sectors(struct btree_iter *iter, struct bkey_s_c k, - u64 offset, s64 sectors, - struct bucket_stats_cache_set *stats) +static void bch_subtract_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) { - bch_add_sectors(iter, k, offset, -sectors, stats); + bch_add_sectors(s, k, offset, -sectors); } /* These wrappers subtract exactly the sectors that we're removing from @k */ -static void bch_cut_subtract_back(struct btree_iter *iter, - struct bpos where, struct bkey_s k, - struct bucket_stats_cache_set *stats) +static void bch_cut_subtract_back(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) { - bch_subtract_sectors(iter, k.s_c, where.offset, - k.k->p.offset - where.offset, - stats); + bch_subtract_sectors(s, k.s_c, where.offset, + k.k->p.offset - where.offset); bch_cut_back(where, k.k); } -static void bch_cut_subtract_front(struct btree_iter *iter, - struct bpos where, struct bkey_s k, - struct bucket_stats_cache_set *stats) +static void bch_cut_subtract_front(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) { - bch_subtract_sectors(iter, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k), - stats); + bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), + where.offset - bkey_start_offset(k.k)); __bch_cut_front(where, k); } -static void bch_drop_subtract(struct btree_iter *iter, struct bkey_s k, - struct bucket_stats_cache_set *stats) +static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k) { if (k.k->size) - bch_subtract_sectors(iter, k.s_c, - bkey_start_offset(k.k), k.k->size, - stats); + bch_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); k.k->size = 0; __set_bkey_deleted(k.k); } @@ -1041,18 +1046,6 @@ static bool bch_extent_merge_inline(struct cache_set *, #define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - struct bucket_stats_cache_set stats; - - /* for deleting: */ - struct bkey_i whiteout; - bool do_journal; - bool deleting; -}; - static enum btree_insert_ret extent_insert_should_stop(struct extent_insert_state *s) { @@ -1146,12 +1139,12 @@ static void extent_insert_committed(struct extent_insert_state *s) bkey_cmp(s->committed, insert->k.p) && 
bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) { /* XXX: possibly need to increase our reservation? */ - bch_cut_subtract_back(iter, s->committed, - bkey_i_to_s(&split.k), &s->stats); + bch_cut_subtract_back(s, s->committed, + bkey_i_to_s(&split.k)); bch_cut_front(s->committed, insert); - bch_add_sectors(iter, bkey_i_to_s_c(insert), + bch_add_sectors(s, bkey_i_to_s_c(insert), bkey_start_offset(&insert->k), - insert->k.size, &s->stats); + insert->k.size); } else { bch_cut_back(s->committed, &split.k.k); bch_cut_front(s->committed, insert); @@ -1197,8 +1190,7 @@ __extent_insert_advance_pos(struct extent_insert_state *s, break; case BTREE_HOOK_NO_INSERT: extent_insert_committed(s); - bch_cut_subtract_front(s->insert->iter, next_pos, - bkey_i_to_s(s->insert->k), &s->stats); + bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); break; @@ -1296,16 +1288,14 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ - bch_cut_subtract_front(iter, insert->k.p, k, &s->stats); + bch_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ - bch_cut_subtract_back(iter, - bkey_start_pos(&insert->k), - k, &s->stats); + bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); @@ -1327,7 +1317,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, btree_keys_account_key_drop(&b->nr, t - b->set, _k); - bch_drop_subtract(iter, k, &s->stats); + bch_drop_subtract(s, k); k.k->p = bkey_start_pos(&insert->k); if (!__extent_save(b, node_iter, _k, k.k)) { /* @@ -1381,13 +1371,13 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); - bch_cut_subtract_front(iter, insert->k.p, k, &s->stats); + bch_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, node_iter, _k, k.k); - bch_add_sectors(iter, bkey_i_to_s_c(&split.k), + bch_add_sectors(s, bkey_i_to_s_c(&split.k), bkey_start_offset(&split.k.k), - split.k.k.size, &s->stats); + split.k.k.size); extent_bset_insert(c, iter, &split.k); break; } @@ -1452,9 +1442,8 @@ bch_delete_fixup_extent(struct extent_insert_state *s) if (overlap == BCH_EXTENT_OVERLAP_ALL) { btree_keys_account_key_drop(&b->nr, t - b->set, _k); - bch_subtract_sectors(iter, k.s_c, - bkey_start_offset(k.k), k.k->size, - &s->stats); + bch_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; reserve_whiteout(b, t, _k); } else if (k.k->needs_whiteout || @@ -1583,9 +1572,9 @@ bch_insert_fixup_extent(struct btree_insert *trans, EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch_add_sectors(iter, bkey_i_to_s_c(insert->k), + bch_add_sectors(&s, bkey_i_to_s_c(insert->k), bkey_start_offset(&insert->k->k), - insert->k->k.size, &s.stats); + insert->k->k.size); while (bkey_cmp(s.committed, insert->k->k.p) < 0 && (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && @@ -1652,9 +1641,9 @@ stop: */ if (insert->k->k.size && !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch_subtract_sectors(iter, bkey_i_to_s_c(insert->k), + bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k), 
bkey_start_offset(&insert->k->k), - insert->k->k.size, &s.stats); + insert->k->k.size); bch_cache_set_stats_apply(c, &s.stats, trans->disk_res, gc_pos_btree_node(b)); @@ -1861,8 +1850,9 @@ bad_ptr: cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i " "gen %i last_gc %i mark 0x%08x", buf, PTR_BUCKET_NR(ca, ptr), - g->read_prio, PTR_BUCKET_GEN(ca, ptr), - g->oldest_gen, g->mark.counter); + g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, + ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], + (unsigned) g->mark.counter); cache_member_info_put(); return; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 28a79a1a0e3f..3a30b102c093 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -1077,6 +1077,11 @@ static enum { cancel_delayed_work(&j->write_work); spin_unlock(&j->lock); + if (c->bucket_journal_seq > 1 << 14) { + c->bucket_journal_seq = 0; + bch_bucket_seq_cleanup(c); + } + /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); bch_journal_buf_put(j, old.idx, need_write_just_set); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 3da05da54c98..917fd6ff9e00 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -21,24 +21,21 @@ /* Moving GC - IO loop */ -static bool moving_pred(struct cache *ca, struct bkey_s_c k) +static const struct bch_extent_ptr *moving_pred(struct cache *ca, + struct bkey_s_c k) { - struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; - bool ret = false; if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - rcu_read_lock(); extent_for_each_ptr(e, ptr) - if (PTR_CACHE(c, ptr) == ca && - PTR_BUCKET(ca, ptr)->copygc_gen) - ret = true; - rcu_read_unlock(); + if ((ca->sb.nr_this_dev == ptr->dev) && + PTR_BUCKET(ca, ptr)->mark.copygc) + return ptr; } - return ret; + return NULL; } static int issue_moving_gc_move(struct cache *ca, @@ -49,14 +46,10 @@ static int issue_moving_gc_move(struct cache *ca, const struct bch_extent_ptr *ptr; int ret; - extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) - if ((ca->sb.nr_this_dev == ptr->dev) && - PTR_BUCKET(ca, ptr)->copygc_gen) - goto found; + ptr = moving_pred(ca, k); + if (!ptr) /* We raced - bucket's been reused */ + return 0; - /* We raced - bucket's been reused */ - return 0; -found: ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr); if (!ret) trace_bcache_gc_copy(k.k); @@ -116,7 +109,7 @@ next: /* don't check this if we bailed out early: */ for_each_bucket(g, ca) - if (g->copygc_gen && bucket_sectors_used(g)) { + if (g->mark.copygc && bucket_sectors_used(g)) { sectors_not_moved += bucket_sectors_used(g); buckets_not_moved++; } @@ -149,6 +142,7 @@ static void bch_moving_gc(struct cache *ca) { struct cache_set *c = ca->set; struct bucket *g; + struct bucket_mark new; u64 sectors_to_move; size_t buckets_to_move, buckets_unused = 0; struct bucket_heap_entry e; @@ -182,7 +176,7 @@ static void bch_moving_gc(struct cache *ca) /* * We need bucket marks to be up to date, so gc can't be recalculating * them, and we don't want the allocator invalidating a bucket after - * we've decided to evacuate it but before we set copygc_gen: + * we've decided to evacuate it but before we set copygc: */ down_read(&c->gc_lock); mutex_lock(&ca->heap_lock); @@ -190,7 +184,7 @@ static void bch_moving_gc(struct cache *ca) ca->heap.used = 0; for_each_bucket(g, ca) { - g->copygc_gen = 0; + bucket_cmpxchg(g, new, new.copygc = 0); if 
(bucket_unused(g)) { buckets_unused++; @@ -219,7 +213,7 @@ static void bch_moving_gc(struct cache *ca) } for (i = 0; i < ca->heap.used; i++) - ca->heap.data[i].g->copygc_gen = 1; + bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1); buckets_to_move = ca->heap.used; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1aba0a698755..93a97114a10e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1341,9 +1341,10 @@ static const char *run_cache_set(struct cache_set *c) */ bch_journal_start(c); + err = "error starting allocator thread"; for_each_cache(ca, c, i) if (ca->mi.state == CACHE_ACTIVE && - (err = bch_cache_allocator_start_once(ca))) { + bch_cache_allocator_start(ca)) { percpu_ref_put(&ca->ref); goto err; } @@ -1406,9 +1407,10 @@ static const char *run_cache_set(struct cache_set *c) bch_journal_start(c); bch_journal_set_replay_done(&c->journal); + err = "error starting allocator thread"; for_each_cache(ca, c, i) if (ca->mi.state == CACHE_ACTIVE && - (err = bch_cache_allocator_start_once(ca))) { + bch_cache_allocator_start(ca)) { percpu_ref_put(&ca->ref); goto err; } @@ -1709,7 +1711,7 @@ static void bch_cache_free_work(struct work_struct *work) kfree(ca->bio_prio); kfree(ca->journal.bio); vfree(ca->buckets); - vfree(ca->bucket_gens); + vfree(ca->oldest_gens); free_heap(&ca->heap); free_fifo(&ca->free_inc); @@ -1979,7 +1981,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) || !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) || !init_heap(&ca->heap, heap_size, GFP_KERNEL) || - !(ca->bucket_gens = vzalloc(sizeof(u8) * + !(ca->oldest_gens = vzalloc(sizeof(u8) * ca->mi.nbuckets)) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->mi.nbuckets)) || @@ -2211,8 +2213,8 @@ have_slot: bch_notify_cache_added(ca); if (ca->mi.state == CACHE_ACTIVE) { - err = bch_cache_allocator_start_once(ca); - if (err) + err = "error starting allocator thread"; + if (bch_cache_allocator_start(ca)) goto err_put; err = __bch_cache_read_write(ca); |
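
One detail worth calling out: the mark stores only the low 16 bits of the journal sequence number, so both bch_find_empty_buckets() and bch_bucket_seq_cleanup() decide "has this sequence reached disk?" with a signed 16-bit subtraction, which stays correct across wraparound as long as the two values are less than 2^15 apart. The journal write path (journal.c hunk above) resets c->bucket_journal_seq and runs bch_bucket_seq_cleanup() once the counter passes 1 << 14, which presumably keeps the stored sequences well inside that window. A small standalone check of the idiom, using the same form of comparison as the patch:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Wraparound-safe "has this sequence number reached disk yet?" check, in the
 * same form as the (s16) comparisons in bch_find_empty_buckets() and
 * bch_bucket_seq_cleanup(): valid while the two 16-bit sequence numbers are
 * less than 2^15 apart.
 */
static bool seq_ondisk(uint16_t last_seq_ondisk, uint16_t bucket_seq)
{
	return (int16_t) last_seq_ondisk - (int16_t) bucket_seq >= 0;
}

int main(void)
{
	printf("%d\n", seq_ondisk(100, 90));	/* 1: bucket seq already flushed */
	printf("%d\n", seq_ondisk(100, 110));	/* 0: bucket touched after last flush */
	printf("%d\n", seq_ondisk(5, 65530));	/* 1: still correct across wraparound */
	return 0;
}
```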