author    | Kent Overstreet <kent.overstreet@gmail.com> | 2019-06-30 16:28:01 -0400
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2020-05-06 17:14:16 -0400
commit    | ea5715a73506eb929e43b66eb3b87c94e2b44ab4 (patch)
tree      | a145b47f47c831f20c6ee694995a5f9b7e2e6e31 /fs/bcachefs/journal_io.c
parent    | 5f6131b81dfa624673447c41cfb69c151086b802 (diff)
Merge with 1f431b384d bcachefs: Refactor trans_(get|update)_key
Diffstat (limited to 'fs/bcachefs/journal_io.c')
-rw-r--r-- | fs/bcachefs/journal_io.c | 797
1 file changed, 252 insertions(+), 545 deletions(-)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 36ba6a4daf84..af135e263a3f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,49 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" -#include "btree_gc.h" -#include "btree_update.h" +#include "alloc_foreground.h" #include "buckets.h" #include "checksum.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" -#include "journal_seq_blacklist.h" #include "replicas.h" #include <trace/events/bcachefs.h> -static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - struct journal_list { struct closure cl; struct mutex lock; @@ -171,12 +138,12 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, + struct bkey_i *k, enum btree_node_type key_type, const char *type, int write) { void *next = vstruct_next(entry); const char *invalid; - char buf[160]; + unsigned version = le32_to_cpu(jset->version); int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, @@ -205,12 +172,17 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + bch2_bkey_swab(NULL, bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); if (invalid) { - bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", type, invalid, buf); @@ -219,6 +191,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, journal_entry_null_range(vstruct_next(entry), next); return 0; } + + if (write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(key_type, bkey_to_packed(k), write); fsck_err: return ret; } @@ -232,8 +208,8 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, vstruct_for_each(entry, k) { int ret = journal_validate_key(c, jset, entry, k, - bkey_type(entry->level, - entry->btree_id), + __btree_node_type(entry->level, + entry->btree_id), "key", write); if (ret) return ret; @@ -305,6 +281,7 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); + goto out; } bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); @@ -314,6 +291,49 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, "invalid journal seq blacklist entry: start 
> end")) { journal_entry_null_range(entry, vstruct_next(entry)); } +out: +fsck_err: + return ret; +} + +static int journal_entry_validate_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_data_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } fsck_err: return ret; @@ -336,18 +356,10 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { static int journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, int write) { - int ret = 0; - - if (entry->type >= BCH_JSET_ENTRY_NR) { - journal_entry_err(c, "invalid journal entry type %u", - entry->type); - journal_entry_null_range(entry, vstruct_next(entry)); - return 0; - } - - ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); -fsck_err: - return ret; + return entry->type < BCH_JSET_ENTRY_NR + ? bch2_jset_entry_ops[entry->type].validate(c, jset, + entry, write) + : 0; } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, @@ -380,14 +392,17 @@ static int jset_validate(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); struct bch_csum csum; + unsigned version; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - le32_to_cpu(jset->version)); + version = le32_to_cpu(jset->version); + if ((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max) { + bch_err(c, "unknown journal entry version %u", jset->version); return BCH_FSCK_UNKNOWN_VERSION; } @@ -455,11 +470,10 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, static int journal_read_bucket(struct bch_dev *ca, struct journal_read_buf *buf, struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) + unsigned bucket) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; struct jset *j = NULL; unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), @@ -471,10 +485,14 @@ static int journal_read_bucket(struct bch_dev *ca, while (offset < end) { if (!sectors_read) { -reread: sectors_read = min_t(unsigned, + struct bio *bio; +reread: + sectors_read = min_t(unsigned, end - offset, buf->size >> 9); - bio_reset(bio); + bio = bio_kmalloc(GFP_KERNEL, + buf_pages(buf->data, + sectors_read << 9)); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = offset; bio->bi_iter.bi_size = sectors_read << 9; @@ -482,6 +500,7 @@ reread: sectors_read = min_t(unsigned, bch2_bio_map(bio, buf->data); ret = submit_bio_wait(bio); + bio_put(bio); if 
(bch2_dev_io_err_on(ret, ca, "journal read from sector %llu", @@ -536,7 +555,6 @@ reread: sectors_read = min_t(unsigned, switch (ret) { case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; break; case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; @@ -544,9 +562,6 @@ reread: sectors_read = min_t(unsigned, return ret; } - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); @@ -560,138 +575,59 @@ next_block: static void bch2_journal_read_device(struct closure *cl) { -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; + u64 min_seq = U64_MAX; + unsigned i; int ret; if (!ja->nr) goto out; - bitmap_zero(bitmap, ja->nr); ret = journal_read_buf_realloc(&buf, PAGE_SIZE); if (ret) goto err; pr_debug("%u journal buckets", ja->nr); - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); + /* Find the journal bucket with the highest sequence number: */ + for (i = 0; i < ja->nr; i++) { + if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) + ja->cur_idx = i; - if (cur_seq != seq) - l = m; - else - r = m; + min_seq = min(ja->bucket_seq[i], min_seq); } -search_done: /* - * Find the journal bucket with the highest sequence number: - * * If there's duplicate journal entries in multiple buckets (which * definitely isn't supposed to happen, but...) 
- make sure to start * cur_idx at the last of those buckets, so we don't deadlock trying to * allocate */ - seq = 0; + while (ja->bucket_seq[ja->cur_idx] > min_seq && + ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } + ja->sectors_free = 0; /* - * Set last_idx to indicate the entire journal is full and needs to be + * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't * pinned when it first runs: */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); @@ -702,32 +638,15 @@ err: jlist->ret = ret; mutex_unlock(&jlist->lock); goto out; -#undef read_bucket -} - -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } } int bch2_journal_read(struct bch_fs *c, struct list_head *list) { - struct journal *j = &c->journal; struct journal_list jlist; struct journal_replay *i; - struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq, seq; unsigned iter; - size_t entries = 0; - u64 nr, keys = 0; + size_t keys = 0, entries = 0; bool degraded = false; int ret = 0; @@ -737,7 +656,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) jlist.ret = 0; for_each_member_device(ca, c, iter) { - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || @@ -756,12 +676,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct bch_replicas_padded replicas; + char buf[80]; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -771,294 +691,89 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs), c, - "superblock not marked as containing replicas (type %u)", - BCH_DATA_JOURNAL))) { - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + "superblock not marked as containing replicas %s", + (bch2_replicas_entry_to_text(&PBUF(buf), + &replicas.e), buf)))) { + ret = 
bch2_mark_replicas(c, &replicas.e); if (ret) return ret; } - } - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; for_each_jset_key(k, _n, entry, &i->j) keys++; - } - - i = list_last_entry(list, struct journal_replay, list); - - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - - fsck_err_on(c->sb.clean && (keys || nr > 1), c, - "filesystem marked clean but journal not empty (%llu keys in %llu entries)", - keys, nr); - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (bch2_journal_seq_blacklist_read(j, i)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = bch2_journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; entries++; } - bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu", - keys, entries, journal_cur_seq(j)); -fsck_err: - return ret; -} - -/* journal replay: */ + if (!list_empty(list)) { + i = list_last_entry(list, struct journal_replay, list); -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct journal_entry_pin_list *pin_list; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - - j->replay_journal_seq = le64_to_cpu(i->j.seq); - - for_each_jset_key(k, _n, entry, &i->j) { - - if (entry->btree_id == BTREE_ID_ALLOC) { - /* - * allocation code handles replay for - * BTREE_ID_ALLOC keys: - */ - ret = bch2_alloc_replay_key(c, k->k.p); - } else { - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - - ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - pin_list = journal_seq_pin(j, j->replay_journal_seq); - - if (atomic_dec_and_test(&pin_list->count)) - journal_wake(j); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, le64_to_cpu(i->j.seq)); } - - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - ret = bch2_journal_flush_all_pins(j); -err: - bch2_journal_entries_free(list); +fsck_err: return ret; } /* journal write: */ -static void bch2_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - struct jset_entry *entry; - - entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); - entry->type = BCH_JSET_ENTRY_btree_root; - entry->btree_id = id; - entry->level = level; - memcpy_u64s(entry->_data, k, k->k.u64s); -} - -static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); - - return available; -} - -/* returns number of sectors available for next journal entry: */ -int bch2_journal_entry_sectors(struct journal *j) +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = UINT_MAX; - unsigned i, nr_online = 0, nr_devs = 0; + unsigned i; - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; + if (*replicas >= replicas_want) + return; - if (!ja->nr) + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + if (!ca) continue; - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); + ja = &ca->journal; /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: + * Check that we can use this device, and aren't already using + * it: */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; - } + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_RW || + !ja->nr || + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), + ca->dev_idx) || + sectors > ja->sectors_free) + continue; - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; - } - rcu_read_unlock(); + bch2_dev_stripe_increment(c, ca, &j->wp.stripe); + + bch2_bkey_append_ptr(&w->key, + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, + .dev = ca->dev_idx, + }); - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; + *replicas += ca->mi.durability; - return sectors_available; + if (*replicas >= replicas_want) + break; + } } /** @@ -1068,95 +783,51 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = + unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, 
are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. - */ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + rcu_read_lock(); - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; - } + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, + &c->rw_devs[BCH_DATA_JOURNAL]); - replicas = bch2_extent_nr_ptrs(e.c); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); - rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, - &c->rw_devs[BCH_DATA_JOURNAL]); + if (replicas >= replicas_want) + goto done; for (i = 0; i < devs_sorted.nr; i++) { ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); if (!ca) continue; - if (!ca->mi.durability) - continue; - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) - continue; - - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - extent_ptr_append(bkey_i_to_extent(&j->key), - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), - .dev = ca->dev_idx, - }); + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; - replicas += ca->mi.durability; + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + } } - rcu_read_unlock(); - - j->prev_buf_sectors = 0; - - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; - - BUG_ON(!replicas); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +done: + rcu_read_unlock(); - return 0; + return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; } static void journal_write_compact(struct jset *jset) @@ -1208,17 +879,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; - if (buf->size >= new_size) + if (buf->buf_size >= new_size) return; new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); if (!new_buf) return; - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); + memcpy(new_buf, buf->data, buf->buf_size); + kvpfree(buf->data, buf->buf_size); buf->data = new_buf; - buf->size = new_size; + buf->buf_size = new_size; } static void journal_write_done(struct closure *cl) @@ -1227,24 +898,31 @@ static void journal_write_done(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); struct bch_devs_list devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); + + bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); goto err; } - if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + + if (bch2_mark_replicas(c, &replicas.e)) goto err; -out: - bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); - j->last_seq_ondisk = seq; if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; + j->seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: @@ -1252,8 +930,8 @@ out: * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - + mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); +out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1270,7 +948,7 @@ out: return; err: bch2_fatal_error(c); - bch2_journal_halt(j); + spin_lock(&j->lock); goto out; } @@ -1285,7 +963,7 @@ static void journal_write_endio(struct bio *bio) unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); - bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } @@ -1299,36 +977,51 @@ void bch2_journal_write(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_prev_buf(j); + struct jset_entry *start, *end; struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s; + int ret; + + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - if (r->alive) - bch2_journal_add_btree_root(w, i, &r->key, r->level); - } - c->btree_roots_dirty = false; - mutex_unlock(&c->btree_root_lock); + start = vstruct_last(jset); + end = bch2_journal_super_entries_add_common(c, start, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + + le32_add_cpu(&jset->u64s, 
u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); journal_write_compact(jset); jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + + if (le32_to_cpu(jset->version) < + bcachefs_metadata_version_bkey_renumber) + validate_before_checksum = true; + + if (validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; @@ -1339,18 +1032,33 @@ void bch2_journal_write(struct closure *cl) jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (!validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); + BUG_ON(sectors > w->sectors); + + bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); - if (journal_write_alloc(j, w, sectors)) { - bch2_journal_halt(j); + if (ret) { bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); @@ -1389,7 +1097,7 @@ void bch2_journal_write(struct closure *cl) trace_journal_write(bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } for_each_rw_member(ca, c, i) @@ -1407,8 +1115,7 @@ void bch2_journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; + bch2_bucket_seq_cleanup(c); continue_at(cl, journal_write_done, system_highpri_wq); return; |
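
The diff above replaces the old golden-ratio-hash/binary-search scan in bch2_journal_read_device() with a straight pass over every bucket, after which cur_idx is chosen from ja->bucket_seq[]. The following standalone sketch (not kernel code; struct sketch_journal_dev, pick_cur_idx() and the sample sequence numbers are invented for illustration) mirrors that selection logic under the assumption that bucket_seq[] already holds the newest sequence number found in each bucket:

```c
/*
 * Sketch of the bucket-selection logic this patch adds to
 * bch2_journal_read_device(): pick the bucket holding the highest
 * sequence number, then (quoting the patch's comment) if duplicate
 * journal entries ended up in multiple buckets, "make sure to start
 * cur_idx at the last of those buckets, so we don't deadlock trying
 * to allocate". Types are simplified stand-ins, not bcachefs structs.
 */
#include <stdint.h>
#include <stdio.h>

struct sketch_journal_dev {
	unsigned	nr;		/* number of journal buckets */
	uint64_t	*bucket_seq;	/* newest seq seen in each bucket */
	unsigned	cur_idx;	/* bucket the next write will use */
	unsigned	dirty_idx;	/* everything from here is unreclaimed */
};

static void pick_cur_idx(struct sketch_journal_dev *ja)
{
	uint64_t min_seq = UINT64_MAX;
	unsigned i;

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;
		if (ja->bucket_seq[i] < min_seq)
			min_seq = ja->bucket_seq[i];
	}

	/* Walk forward past duplicated/decreasing sequence numbers: */
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] >
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	/*
	 * As in the patch: treat the whole journal as dirty until reclaim
	 * runs, starting right after the current write position.
	 */
	ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
}

int main(void)
{
	uint64_t seqs[] = { 7, 8, 9, 9, 4, 5, 6 };	/* made-up example */
	struct sketch_journal_dev ja = {
		.nr = 7, .bucket_seq = seqs, .cur_idx = 0, .dirty_idx = 0,
	};

	pick_cur_idx(&ja);
	printf("cur_idx %u dirty_idx %u\n", ja.cur_idx, ja.dirty_idx);
	return 0;
}
```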