diff options
-rw-r--r-- | drivers/md/bcache/journal.c | 267 | ||||
-rw-r--r-- | drivers/md/bcache/journal_types.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/super.h | 4 |
4 files changed, 169 insertions, 110 deletions
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index cce103a8c59f..18a20a5d9dd8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -111,12 +111,21 @@ struct journal_list { int ret; }; -static int journal_add_entry(struct journal_list *jlist, struct jset *j) +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static enum { + JOURNAL_ENTRY_ADD_ERROR, + JOURNAL_ENTRY_ADD_OUT_OF_RANGE, + JOURNAL_ENTRY_ADD_OK, + +} journal_entry_add(struct journal_list *jlist, struct jset *j) { struct journal_replay *i, *pos; struct list_head *where; size_t bytes = set_bytes(j); - int ret = 0; + int ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; mutex_lock(&jlist->lock); @@ -130,7 +139,7 @@ static int journal_add_entry(struct journal_list *jlist, struct jset *j) } } - ret = 1; + ret = JOURNAL_ENTRY_ADD_OK; /* Drop entries we don't need anymore */ list_for_each_entry_safe(i, pos, jlist->head, list) { @@ -157,7 +166,7 @@ static int journal_add_entry(struct journal_list *jlist, struct jset *j) add: i = kmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) { - ret = -ENOMEM; + ret = JOURNAL_ENTRY_ADD_ERROR; goto out; } @@ -170,16 +179,71 @@ out: return ret; } +static enum { + JOURNAL_ENTRY_BAD, + JOURNAL_ENTRY_REREAD, + JOURNAL_ENTRY_OK, +} journal_entry_validate(struct cache *ca, const struct jset *j, u64 sector, + unsigned bucket_sectors_left, unsigned sectors_read) +{ + size_t bytes = set_bytes(j); + u64 got, expect; + + if (bch_meta_read_fault("journal")) + return JOURNAL_ENTRY_BAD; + + if (j->magic != jset_magic(&ca->set->sb)) { + pr_debug("bad magic while reading journal from %llu", sector); + return JOURNAL_ENTRY_BAD; + } + + got = j->version; + expect = BCACHE_JSET_VERSION; + if (got != expect) { + __bch_cache_error(ca, + "bad journal version (got %llu expect %llu) sector %lluu", + got, expect, sector); + return JOURNAL_ENTRY_BAD; + } + + if (bytes > bucket_sectors_left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + __bch_cache_error(ca, + "journal entry too big (%zu bytes), sector %lluu", + bytes, sector); + return JOURNAL_ENTRY_BAD; + } + + if (bytes > sectors_read << 9) + return JOURNAL_ENTRY_REREAD; + + got = j->csum; + expect = csum_set(j, JSET_CSUM_TYPE(j)); + if (got != expect) { + __bch_cache_error(ca, + "journal checksum bad (got %llu expect %llu), sector %lluu", + got, expect, sector); + return JOURNAL_ENTRY_BAD; + } + + if (j->last_seq > j->seq) { + __bch_cache_error(ca, + "invalid journal entry: last_seq > seq"); + return JOURNAL_ENTRY_BAD; + } + + return JOURNAL_ENTRY_OK; +} + static int journal_read_bucket(struct cache *ca, struct journal_list *jlist, - unsigned bucket_index, u64 *seq) + unsigned bucket, u64 *seq) { struct cache_set *c = ca->set; struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; struct jset *j, *data; - unsigned len, left, offset = 0; - sector_t bucket = bucket_to_sector(ca, - journal_bucket(ca, bucket_index)); + unsigned blocks, sectors_read, bucket_offset = 0; + u64 sector = bucket_to_sector(ca, journal_bucket(ca, bucket)); bool entries_found = false; int ret = 0; @@ -189,16 +253,18 @@ static int journal_read_bucket(struct cache *ca, struct journal_list *jlist, data = c->journal.w[0].data; } - pr_debug("reading %u", bucket_index); + pr_debug("reading %u", bucket); - while (offset < ca->mi.bucket_size) { -reread: left = ca->mi.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); + while (bucket_offset < ca->mi.bucket_size) { +reread: + sectors_read = min_t(unsigned, + ca->mi.bucket_size - bucket_offset, + PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_iter.bi_sector = bucket + offset; - bio->bi_iter.bi_size = len << 9; + bio->bi_iter.bi_sector = sector + bucket_offset; + bio->bi_iter.bi_size = sectors_read << 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); bch_bio_map(bio, data); @@ -207,8 +273,8 @@ reread: left = ca->mi.bucket_size - offset; ret = -EIO; if (ret) { __bch_cache_error(ca, - "IO error %d reading journal from offset %zu", - ret, bucket + offset); + "IO error %d reading journal from bucket_offset %llu", + ret, sector + bucket_offset); goto err; } @@ -219,59 +285,41 @@ reread: left = ca->mi.bucket_size - offset; */ j = data; - while (len) { - size_t blocks, bytes = set_bytes(j); - u64 got, expect; - - if (bch_meta_read_fault("journal")) - goto err; - - if (j->magic != jset_magic(&c->sb)) { - pr_debug("%u: bad magic", bucket_index); - goto err; - } - - got = j->version; - expect = BCACHE_JSET_VERSION; - if (got != expect) { - __bch_cache_error(ca, - "bad version (got %llu expect %llu) while reading journal from offset %zu", - got, expect, bucket + offset); - goto err; - } - - if (bytes > left << 9 || - bytes > PAGE_SIZE << JSET_BITS) { - __bch_cache_error(ca, - "too big (%zu bytes) while reading journal from offset %zu", - bytes, bucket + offset); + while (sectors_read) { + switch (journal_entry_validate(ca, j, + sector + bucket_offset, + ca->mi.bucket_size - bucket_offset, + sectors_read)) { + case JOURNAL_ENTRY_BAD: + /* XXX: don't skip rest of bucket if single + * checksum error */ goto err; - } - - if (bytes > len << 9) + case JOURNAL_ENTRY_REREAD: goto reread; - - got = j->csum; - expect = csum_set(j, JSET_CSUM_TYPE(j)); - if (got != expect) { - __bch_cache_error(ca, - "bad checksum (got %llu expect %llu) while reading journal from offset %zu", - got, expect, bucket + offset); - goto err; + case JOURNAL_ENTRY_OK: + break; } - if (j->last_seq > j->seq) { - __bch_cache_error(ca, - "invalid journal entry: last_seq > seq"); - goto err; - } + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. We don't need the rest of the + * bucket: + */ + if (j->seq < ja->bucket_seq[bucket]) + goto out; - ret = journal_add_entry(jlist, j); - if (ret < 0) + ja->bucket_seq[bucket] = j->seq; + + switch (journal_entry_add(jlist, j)) { + case JOURNAL_ENTRY_ADD_ERROR: + ret = -ENOMEM; goto err; - if (ret) { - ja->seq[bucket_index] = j->seq; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + case JOURNAL_ENTRY_ADD_OK: entries_found = true; + break; } if (j->seq > *seq) @@ -280,12 +328,12 @@ reread: left = ca->mi.bucket_size - offset; blocks = set_blocks(j, block_bytes(c)); pr_debug("next"); - offset += blocks * ca->sb.block_size; - len -= blocks * ca->sb.block_size; + bucket_offset += blocks * ca->sb.block_size; + sectors_read -= blocks * ca->sb.block_size; j = ((void *) j) + blocks * block_bytes(ca); } } - +out: ret = entries_found; err: if (data == c->journal.w[0].data) @@ -316,19 +364,32 @@ static void bch_journal_read_device(struct closure *cl) struct cache *ca = container_of(ja, struct cache, journal); struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); + struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); unsigned nr_buckets = bch_nr_journal_buckets(&ca->sb); DECLARE_BITMAP(bitmap, nr_buckets); - unsigned i, l, r, m; + unsigned i, l, r; u64 seq = 0; bitmap_zero(bitmap, nr_buckets); pr_debug("%u journal buckets", nr_buckets); - if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev))) + if (!blk_queue_nonrot(q)) goto linear_scan; /* + * If the device supports discard but not secure discard, we can't do + * the fancy fibonacci hash/binary search because the live journal + * entries might not form a contiguous range: + */ + if (blk_queue_discard(q) && + !blk_queue_secure_erase(q)) { + for (i = 0; i < nr_buckets; i++) + read_bucket(i); + goto search_done; + } + + /* * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ @@ -359,14 +420,13 @@ linear_scan: closure_return(cl); bsearch: /* Binary search */ - m = l; r = find_next_bit(bitmap, nr_buckets, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { + unsigned m = (l + r) >> 1; u64 cur_seq = seq; - m = (l + r) >> 1; read_bucket(m); if (cur_seq != seq) @@ -375,45 +435,39 @@ bsearch: r = m; } - /* - * Read buckets in reverse order until we stop finding more - * journal entries - */ - pr_debug("finishing up: m %u njournal_buckets %u", - m, nr_buckets); - l = m; - - while (1) { - if (!l--) - l = nr_buckets - 1; - - if (l == m) - break; - - if (test_bit(l, bitmap)) - continue; - - if (!read_bucket(l)) - break; - } - +search_done: + /* Find the journal bucket with the highest sequence number: */ seq = 0; for (i = 0; i < nr_buckets; i++) - if (ja->seq[i] > seq) { - seq = ja->seq[i]; + if (ja->bucket_seq[i] > seq) { /* * When journal_next_bucket() goes to allocate for * the first time, it'll use the bucket after * ja->cur_idx */ ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - nr_buckets; - pr_debug("cur_idx %d last_idx %d", - ja->cur_idx, ja->last_idx); + seq = ja->bucket_seq[i]; } + /* + * Set last_idx and discard_idx to indicate the entire journal is full + * and needs to be reclaimed - journal reclaim will immediately reclaim + * whatever isn't pinned when it first runs: + */ + ja->last_idx = ja->discard_idx = (i + 1) % nr_buckets; + + /* + * Read buckets in reverse order until we stop finding more journal + * entries: + */ + for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets; + i != ja->cur_idx; + i = (i + nr_buckets - 1) % nr_buckets) + if (!test_bit(i, bitmap) && + !read_bucket(i)) + break; + closure_return(cl); #undef read_bucket } @@ -605,11 +659,13 @@ static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) if (ret) return ret; - p = krealloc(ca->journal.seq, nr * sizeof(u64), GFP_KERNEL|__GFP_ZERO); + p = krealloc(ca->journal.bucket_seq, + nr * sizeof(u64), + GFP_KERNEL|__GFP_ZERO); if (!p) return -ENOMEM; - ca->journal.seq = p; + ca->journal.bucket_seq = p; ca->sb.u64s = u64s; return 0; @@ -801,7 +857,7 @@ static void journal_reclaim_work(struct work_struct *work) */ if (journal_free_buckets(ca) < (nr >> 1)) { oldest_seq = max_t(u64, oldest_seq, - ja->seq[next]); + ja->bucket_seq[next]); flush = true; } } @@ -869,7 +925,7 @@ static void journal_reclaim_fast(struct cache_set *c) unsigned nr = bch_nr_journal_buckets(&ca->sb); while (ja->last_idx != ja->cur_idx && - ja->seq[ja->last_idx] < last_seq) + ja->bucket_seq[ja->last_idx] < last_seq) ja->last_idx = (ja->last_idx + 1) % nr; /* @@ -953,7 +1009,8 @@ static void journal_next_bucket(struct cache_set *c) * will make another bucket available: */ if (remaining == 1 && - ja->seq[(next + 1) % nr_buckets] >= last_seq(&c->journal)) + ja->bucket_seq[(next + 1) % nr_buckets] >= + last_seq(&c->journal)) continue; BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX); @@ -966,6 +1023,8 @@ static void journal_next_bucket(struct cache_set *c) journal_bucket(ca, ja->cur_idx)), ca->sb.nr_this_dev); + ja->bucket_seq[ja->cur_idx] = c->journal.seq; + trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx, ja->discard_idx); bch_set_extent_ptrs(e, bch_extent_ptrs(e) + 1); @@ -1133,7 +1192,7 @@ static void journal_write_locked(struct closure *cl) SET_PTR_OFFSET(ptr, PTR_OFFSET(ptr) + sectors); - ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + ca->journal.bucket_seq[ca->journal.cur_idx] = w->data->seq; } /* @@ -1495,7 +1554,7 @@ int bch_journal_move(struct cache *ca) nr_buckets = bch_nr_journal_buckets(&ca->sb); for (i = 0; i < nr_buckets; i += 1) - BUG_ON(ca->journal.seq[i] > last_flushed_seq); + BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq); return ret; } diff --git a/drivers/md/bcache/journal_types.h b/drivers/md/bcache/journal_types.h index 971af1e043ca..395916f9bd87 100644 --- a/drivers/md/bcache/journal_types.h +++ b/drivers/md/bcache/journal_types.h @@ -88,7 +88,7 @@ struct journal_device { * For each journal bucket, contains the max sequence number of the * journal writes it contains - so we know when a bucket can be reused. */ - u64 *seq; + u64 *bucket_seq; unsigned sectors_free; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b677fc55fa0a..6375134a4515 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1484,7 +1484,7 @@ static void bch_cache_free_work(struct work_struct *work) bioset_free(ca->replica_set); free_percpu(ca->bucket_stats_percpu); - kfree(ca->journal.seq); + kfree(ca->journal.bucket_seq); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); kfree(ca->bio_prio); @@ -1870,8 +1870,8 @@ static const char *cache_alloc(struct bcache_superblock *sb, !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->replica_set = bioset_create(4, offsetof(struct bbio, bio))) || !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats)) || - !(ca->journal.seq = kcalloc(bch_nr_journal_buckets(&ca->sb), - sizeof(u64), GFP_KERNEL)) || + !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(&ca->sb), + sizeof(u64), GFP_KERNEL)) || !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca)))) goto err; diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h index 9c46b724cff9..789027ab2c2c 100644 --- a/drivers/md/bcache/super.h +++ b/drivers/md/bcache/super.h @@ -85,8 +85,8 @@ u64 bch_checksum(unsigned, const void *, size_t); */ #define csum_set(i, type) \ ({ \ - void *start = ((void *) (i)) + sizeof(u64); \ - void *end = __bset_bkey_last(i); \ + const void *start = ((const void *) (i)) + sizeof(u64); \ + const void *end = __bset_bkey_last(i); \ \ bch_checksum(type, start, end - start); \ }) |