diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2018-06-11 08:24:18 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2018-06-11 13:14:14 -0400 |
commit | 12daee0f9d39799816fd15e1f393bf169b0d8dcc (patch) | |
tree | cd73f0a2693dff1c502554d7aa6e07a4349ca08e | |
parent | 5c5aa6371e17dea55e51ed508759287a43f813e7 (diff) |
bcachefs: bch_sb_field_clean
Implement a superblock field so we don't have to read the journal after
a clean shutdown (and, more importantly, so we can verify what we find in
the journal after a clean shutdown).
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r-- | fs/bcachefs/bcachefs_format.h | 47 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.c | 114 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.h | 4 | ||||
-rw-r--r-- | fs/bcachefs/recovery.c | 167 | ||||
-rw-r--r-- | fs/bcachefs/super-io.c | 82 | ||||
-rw-r--r-- | fs/bcachefs/super-io.h | 4 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 14 |
7 files changed, 321 insertions, 111 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index dffb758ad834..8e74de4f8c32 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -878,7 +878,8 @@ struct bch_sb_field { x(crypt, 2) \ x(replicas, 3) \ x(quota, 4) \ - x(disk_groups, 5) + x(disk_groups, 5) \ + x(clean, 6) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1048,6 +1049,37 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[0]; }; +/* + * On clean shutdown, store btree roots and current journal sequence number in + * the superblock: + */ +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; + + union { + struct bkey_i start[0]; + __u64 _data[0]; + }; +}; + +struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; + __le16 read_clock; + __le16 write_clock; + __le64 journal_seq; + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +}; + /* Superblock: */ /* @@ -1265,19 +1297,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb) #define BCACHE_JSET_VERSION_JKEYS 2 #define BCACHE_JSET_VERSION 2 -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __u8 type; /* designates what this jset holds */ - __u8 pad[3]; - - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; -}; - #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) #define BCH_JSET_ENTRY_TYPES() \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4f873ccc492a..8a4e7b2a92ce 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -13,37 +13,6 @@ #include <trace/events/bcachefs.h> -static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs 
*c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - struct journal_list { struct closure cl; struct mutex lock; @@ -717,6 +686,37 @@ void bch2_journal_entries_free(struct list_head *list) } } +int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *p; + u64 seq, nr = end_seq - last_seq + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + atomic64_set(&j->seq, end_seq); + j->last_seq_ondisk = last_seq; + + j->pin.front = last_seq; + j->pin.back = end_seq + 1; + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + return 0; +} + int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; @@ -724,10 +724,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct journal_replay *i; struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq, seq; + u64 cur_seq, end_seq; unsigned iter; - size_t entries = 0; - u64 nr, keys = 0; + size_t keys = 0, entries = 0; bool degraded = false; int ret = 0; @@ -783,43 +782,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) } } - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - } - i = list_last_entry(list, struct journal_replay, list); - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - 
- fsck_err_on(c->sb.clean && (keys || nr > 1), c, - "filesystem marked clean but journal not empty (%llu keys in %llu entries)", - keys, nr); - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } + ret = bch2_journal_set_seq(c, + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq)); + if (ret) + return ret; mutex_lock(&j->blacklist_lock); @@ -842,6 +811,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct journal_replay, list)->j.seq); list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; bool blacklisted; mutex_lock(&j->blacklist_lock); @@ -863,10 +834,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) journal_last_seq(j), end_seq); cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; entries++; } - bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu", + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", keys, entries, journal_cur_seq(j)); fsck_err: return ret; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4236b7fc37ff..e303df9241de 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -1,9 +1,6 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); - /* * Only used for holding the journal entries we read in btree_journal_read() * during 
cache_registration @@ -37,6 +34,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) +int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); int bch2_journal_entry_sectors(struct journal *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 271eca14753d..9881ab2a1e27 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -12,11 +12,118 @@ #include "recovery.h" #include "super-io.h" +struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j) +{ + unsigned i; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) + bch2_fs_mark_clean(c, false); + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1, 
l2; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + if (!k1 || !k2 || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2) + panic("k1 %px l1 %u k2 %px l2 %u\n", k1, l1, k2, l2); + + mustfix_fsck_err_on(!k1 || !k2 || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static bool journal_empty(struct list_head *journal) +{ + struct journal_replay *i; + struct jset_entry *entry; + + if (list_empty(journal)) + return true; + + i = list_last_entry(journal, struct journal_replay, list); + + if (i->j.last_seq != i->j.seq) + return false; + + list_for_each_entry(i, journal, list) { + vstruct_for_each(&i->j, entry) { + if (entry->type == BCH_JSET_ENTRY_btree_root) + continue; + + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + return false; + } + } + + return true; +} + int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL; LIST_HEAD(journal); - struct jset *j; + struct jset *j = NULL; unsigned i; int ret; @@ -25,22 +132,57 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + + if (c->sb.clean) + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + if (sb_clean) { + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + ret = -ENOMEM; + mutex_unlock(&c->sb_lock); + goto err; + } + } mutex_unlock(&c->sb_lock); - ret = bch2_journal_read(c, &journal); + if (clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + + if (!clean || !c->opts.nofsck) { + ret = bch2_journal_read(c, &journal); + if (ret) + goto err; + + j = &list_entry(journal.prev, struct 
journal_replay, list)->j; + } else { + ret = bch2_journal_set_seq(c, + le64_to_cpu(clean->journal_seq), + le64_to_cpu(clean->journal_seq)); + BUG_ON(ret); + } + + ret = verify_superblock_clean(c, clean, j); if (ret) goto err; - j = &list_entry(journal.prev, struct journal_replay, list)->j; + fsck_err_on(clean && !journal_empty(&journal), c, + "filesystem marked clean but journal not empty"); - c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); + if (clean) { + c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); + } else { + c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); + } for (i = 0; i < BTREE_ID_NR; i++) { unsigned level; struct bkey_i *k; - k = bch2_journal_find_btree_root(c, j, i, &level); + k = btree_root_find(c, clean, j, i, &level); if (!k) continue; @@ -75,15 +217,17 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "mark and sweep done"); - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - if (c->opts.noreplay) goto out; /* + * Mark dirty before journal replay, fsck: + * XXX: after a clean shutdown, this could be done lazily only when fsck + * finds an error + */ + bch2_fs_mark_clean(c, false); + + /* * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - * this is a hack but oh well. 
@@ -121,6 +265,7 @@ int bch2_fs_recovery(struct bch_fs *c) out: bch2_journal_entries_free(&journal); + kfree(clean); return ret; err: fsck_err: diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9772d5973078..54de9fac6e22 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -4,6 +4,7 @@ #include "disk_groups.h" #include "error.h" #include "io.h" +#include "journal.h" #include "replicas.h" #include "quota.h" #include "super-io.h" @@ -89,6 +90,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) struct bch_sb *new_sb; struct bio *bio; + if (sb->sb && sb->page_order >= order) + return 0; + if (sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -849,6 +853,84 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { .validate = bch2_sb_validate_crypt, }; +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *c, bool clean) +{ + struct bch_sb_field_clean *sb_clean; + unsigned u64s = sizeof(*sb_clean) / sizeof(u64); + struct jset_entry *entry; + struct btree_root *r; + + mutex_lock(&c->sb_lock); + if (clean == BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, clean); + + if (!clean) + goto write_super; + + mutex_lock(&c->btree_root_lock); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) + u64s += jset_u64s(r->key.u64s); + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1; + + entry = sb_clean->start; + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) { + entry->u64s = r->key.u64s; + 
entry->btree_id = r - c->btree_roots; + entry->level = r->level; + entry->type = BCH_JSET_ENTRY_btree_root; + bkey_copy(&entry->start[0], &r->key); + entry = vstruct_next(entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + } + + BUG_ON(entry != vstruct_end(&sb_clean->field)); + + mutex_unlock(&c->btree_root_lock); +write_super: + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} + +static const char *bch2_sb_validate_clean(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) + return "invalid field crypt: wrong size"; + + return NULL; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_validate_clean, +}; + static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 995b1c907318..7d09d8e45816 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -131,6 +131,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *, bool); + size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6cafbdb888ee..a2a32b924434 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -199,16 +199,6 @@ int bch2_congested(void *data, int bdi_bits) * - allocator depends on the journal (when it rewrites prios and gens) */ -static void bch2_fs_mark_clean(struct bch_fs *c, bool clean) -{ - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb) != clean) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, clean); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} - static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; @@ -311,10 +301,8 @@ void bch2_fs_read_only(struct bch_fs 
*c) if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_fs_mark_clean(c, true); - } if (c->state != BCH_FS_STOPPING) c->state = BCH_FS_RO; |