Diffstat (limited to 'libbcachefs'): 59 files changed, 873 insertions, 452 deletions
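The patch body follows. Its opening hunks rework bch2_check_discard_freespace_key() so the worker takes an enum bch_fsck_flags bitmask instead of a bare `bool async_repair`, with thin inline wrappers (e.g. bch2_check_discard_freespace_key_async()) keeping the call sites readable. As a rough, self-contained illustration of that bool-to-flags refactor pattern — hypothetical demo_* names, not the bcachefs API — consider:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for FSCK_ERR_NO_LOG / FSCK_ERR_SILENT / FSCK_CAN_FIX; purely illustrative. */
enum demo_fsck_flags {
	DEMO_ERR_NO_LOG = 1 << 0,
	DEMO_ERR_SILENT = 1 << 1,
	DEMO_CAN_FIX    = 1 << 2,
};

/* The single worker takes the full flag set... */
static int __demo_check_key(unsigned fsck_flags)
{
	/* ...and derives the old boolean from it, so behaviour is unchanged: */
	bool async_repair = fsck_flags & DEMO_ERR_NO_LOG;

	fsck_flags |= DEMO_CAN_FIX;
	printf("check: flags=%#x async_repair=%d\n", fsck_flags, async_repair);
	return 0;
}

/* ...while the previous entry points shrink to wrappers, so callers don't churn. */
static inline int demo_check_key(void)        { return __demo_check_key(0); }
static inline int demo_check_key_async(void)  { return __demo_check_key(DEMO_ERR_NO_LOG); }
static inline int demo_check_key_silent(void) { return __demo_check_key(DEMO_ERR_SILENT); }

int main(void)
{
	return demo_check_key() | demo_check_key_async() | demo_check_key_silent();
}

Passing the flags straight through also lets callers request silent or no-log behaviour without growing yet another boolean parameter, which is what the first hunks below do.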
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 66de4631..d64839c7 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1381,7 +1381,7 @@ static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct u8 gen; ret = k.k->type != KEY_TYPE_set - ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) + ? __bch2_check_discard_freespace_key(trans, &iter, &gen, FSCK_ERR_SILENT) : 0; bch2_trans_iter_exit(trans, &iter); return ret; @@ -1397,8 +1397,8 @@ static void check_discard_freespace_key_work(struct work_struct *work) kfree(w); } -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, - bool async_repair) +int __bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + enum bch_fsck_flags fsck_flags) { struct bch_fs *c = trans->c; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard @@ -1406,8 +1406,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite : BCH_DATA_free; struct printbuf buf = PRINTBUF; - unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| - FSCK_CAN_FIX|FSCK_CAN_IGNORE; + bool async_repair = fsck_flags & FSCK_ERR_NO_LOG; + fsck_flags |= FSCK_CAN_FIX|FSCK_CAN_IGNORE; struct bpos bucket = iter->pos; bucket.offset &= ~(~0ULL << 56); @@ -1490,10 +1490,10 @@ delete: } } -static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) +static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); + int ret = __bch2_check_discard_freespace_key(trans, iter, &gen, 0); return ret < 0 ? 
ret : 0; } @@ -1651,7 +1651,7 @@ bkey_err: ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key_fsck(trans, &iter)); + bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; @@ -1664,7 +1664,7 @@ bkey_err: break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key_fsck(trans, &iter); + bch2_check_discard_freespace_key(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 0cc5adc5..c2e8482f 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -309,7 +309,14 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); +int __bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, + enum bch_fsck_flags); + +static inline int bch2_check_discard_freespace_key_async(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) +{ + return __bch2_check_discard_freespace_key(trans, iter, gen, FSCK_ERR_NO_LOG); +} + int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index b375ad61..23a9fbb3 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -269,7 +269,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, return NULL; u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); + int ret = bch2_check_discard_freespace_key_async(trans, freespace_iter, &gen); if (ret < 0) return ERR_PTR(ret); if (ret) diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index e76809e7..77d93beb 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -353,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node))) + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index ac99a8ec..fb3156ed 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -819,6 +819,7 @@ struct bch_fs { struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_devs_mask devs_removed; struct bch_accounting_mem accounting; @@ -863,9 +864,7 @@ struct bch_fs { DARRAY(enum bcachefs_metadata_version) incompat_versions_requested; -#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *cf_encoding; -#endif struct bch_sb_handle disk_sb; @@ -1285,4 +1284,13 @@ static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca : ca->mi.discard; } +static inline int bch2_fs_casefold_enabled(struct bch_fs *c) +{ + if (!IS_ENABLED(CONFIG_UNICODE)) + return bch_err_throw(c, no_casefolding_without_utf8); + if (c->opts.casefold_disabled) + return bch_err_throw(c, casefolding_disabled); + return 0; +} + #endif /* _BCACHEFS_H */ diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 91e0aa79..a3631a90 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -15,6 +15,7 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/seq_buf.h> #include <linux/swap.h> const char * const bch2_btree_node_flags[] = { @@ -444,7 +445,8 @@ retry_unlocked: } if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); + trace_btree_node(c, b, btree_cache_reap); + return 0; } @@ -575,6 +577,19 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(list); } +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -666,6 +681,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 2; shrink->private_data = &bc->live[0]; shrinker_register(shrink); @@ -676,6 +692,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 8; shrink->private_data = &bc->live[1]; shrinker_register(shrink); diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index ca3c1b14..3264801c 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -153,4 +153,15 @@ void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btr void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache 
*); +#define trace_btree_node(_c, _b, event) \ +do { \ + if (trace_##event##_enabled()) { \ + CLASS(printbuf, buf)(); \ + printbuf_indent_add(&buf, 2); \ + bch2_btree_pos_to_text(&buf, c, b); \ + trace_##event(c, buf.buf); \ + } \ + count_event(c, event); \ +} while (0); + #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 697c6ecc..7269490a 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -534,32 +534,39 @@ fsck_err: return ret; } -static int bch2_check_root(struct btree_trans *trans, enum btree_id i, +static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, bool *reconstructed_root) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, i); + struct btree_root *r = bch2_btree_id_root(c, btree); struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_id_to_text(&buf, i); + bch2_btree_id_to_text(&buf, btree); if (r->error) { bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - r->alive = false; - r->error = 0; + ret = bch2_btree_has_scanned_nodes(c, btree); + if (ret < 0) + goto err; - if (!bch2_btree_has_scanned_nodes(c, i)) { + if (!ret) { __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), + FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0), btree_root_unreadable_and_scan_found_nothing, "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); + + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 0); } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 1); + + bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); if (ret) goto err; } @@ -686,7 +693,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto out; - if (trans->nr_updates) { + if (bch2_trans_has_updates(trans)) { ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 08b22bdd..84e302af 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - scoped_guard(rcu) - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_degraded(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates - this needs some real thought on locking in + * order to fix. 
+ * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we need another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). + */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - } + } if (!ptr_written) { set_btree_node_need_rewrite(b); @@ -1771,7 +1798,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, struct bio *bio; int ret; - trace_and_count(c, btree_node_read, trans, b); + trace_btree_node(c, b, btree_node_read); if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) @@ -2505,7 +2532,17 @@ do_write: c->opts.nochanges) goto err; - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); + if (trace_btree_node_write_enabled()) { + CLASS(printbuf, buf)(); + printbuf_indent_add(&buf, 2); + prt_printf(&buf, "offset %u sectors %u bytes %u\n", + b->written, + sectors_to_write, + bytes_to_write); + bch2_btree_pos_to_text(&buf, c, b); + trace_btree_node_write(c, buf.buf); + } + count_event(c, btree_node_write); wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 96697d5c..74639468 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -645,6 +645,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str trans_for_each_update(trans, i) if (!i->cached && + !i->key_cache_flushing && i->level == b->c.level && i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && @@ -2189,7 +2190,7 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek_prev(trans, iter, search_key, - k->k ? k->k->p : path_l(path)->b->key.k.p); + k->k ?
k->k->p : path_l(path)->b->data->min_key); if (next_journal) { iter->k = next_journal->k; @@ -2288,6 +2289,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && + !bkey_deleted(k.k) && (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k)) { @@ -2580,6 +2582,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && + !bkey_deleted(k.k) && (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k2)) { @@ -2795,7 +2798,7 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_ite struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) { struct bpos search_key; - struct bkey_s_c k; + struct bkey_s_c k, k2; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -2854,18 +2857,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + if (unlikely(!k.k)) + goto out; + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { + !bkey_deleted(k.k) && + (k2 = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { + k = k2; if (!bkey_err(k)) iter->k = *k.k; - /* We're not returning a key from iter->path: */ - goto out; } - k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); - if (unlikely(!k.k)) - goto out; - if (unlikely(k.k->type == KEY_TYPE_whiteout && (iter->flags & BTREE_ITER_filter_snapshots) && !(iter->flags & BTREE_ITER_key_cache_fill))) @@ -3238,32 +3241,30 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long } EBUG_ON(trans->mem); + EBUG_ON(trans->mem_bytes); + EBUG_ON(trans->mem_top); + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + + bool lock_dropped = false; + new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, kmalloc(new_bytes, _gfp)); + if (!new_mem) { + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + } - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - trans->used_mempool = true; - } - - EBUG_ON(!new_mem); + EBUG_ON(!new_mem); - trans->mem = new_mem; - trans->mem_bytes = new_bytes; + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + if (unlikely(lock_dropped)) { ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - - p = trans->mem + trans->mem_top; + p = trans->mem; trans->mem_top += size; memset(p, 0, size); return p; @@ -3324,22 +3325,25 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->mem_top = 0; if (unlikely(trans->restarted == BCH_ERR_transaction_restart_mem_realloced)) { - EBUG_ON(!trans->mem || !trans->mem_bytes); unsigned new_bytes = trans->realloc_bytes_required; - void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); - - 
EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); - - if (!new_mem) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - trans->used_mempool = true; - kfree(trans->mem); - } - } + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + EBUG_ON(!trans->mem); + EBUG_ON(!trans->mem_bytes); + + bool lock_dropped = false; + void *new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, + krealloc(trans->mem, new_bytes, _gfp)); + (void)lock_dropped; + + if (!new_mem) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + kfree(trans->mem); + } + + EBUG_ON(!new_mem); + trans->mem = new_mem; trans->mem_bytes = new_bytes; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 09dd3e52..cc2c6bb6 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -963,6 +963,20 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, _p; \ }) +#define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + _lock_dropped = false; \ + if (unlikely(!_p)) { \ + bch2_trans_unlock(_trans); \ + _lock_dropped = true; \ + _gfp = GFP_KERNEL; \ + _p = _do; \ + } \ + _p; \ +}) + struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c index a41fabd0..341d31b3 100644 --- a/libbcachefs/btree_journal_iter.c +++ b/libbcachefs/btree_journal_iter.c @@ -137,12 +137,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b struct journal_key *k; BUG_ON(*idx > keys->nr); + + if (!keys->nr) + return NULL; search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) >= 0) { + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { (*idx)++; iters++; if (iters == 10) { @@ -151,18 +154,23 @@ search: } } + if (*idx == keys->nr) + --(*idx); + struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ - while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { + while (true) { + k = idx_to_key(keys, *idx); if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start - 1; - else - *idx -= 1; + *idx = rcu_dereference(k->overwritten_range)->start; + if (!*idx) + break; + --(*idx); continue; } @@ -171,6 +179,8 @@ search: break; } + if (!*idx) + break; --(*idx); iters++; if (iters == 10) { @@ -707,6 +717,18 @@ static void __journal_keys_sort(struct journal_keys *keys) keys->nr = dst - keys->data; } +static bool should_rewind_entry(struct bch_fs *c, struct jset_entry *entry) +{ + if (entry->level) + return false; + if (btree_id_is_alloc(entry->btree_id)) + return false; + if (c->opts.journal_rewind_no_extents && + entry->btree_id == BTREE_ID_extents) + return false; + return true; +} + int bch2_journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; @@ -725,9 +747,8 @@ int bch2_journal_keys_sort(struct bch_fs *c) cond_resched(); vstruct_for_each(&i->j, entry) { - bool rewind = !entry->level && - !btree_id_is_alloc(entry->btree_id) && - le64_to_cpu(i->j.seq) >= rewind_seq; + bool rewind = le64_to_cpu(i->j.seq) >= rewind_seq && + should_rewind_entry(c, entry); if (entry->type != (rewind ? BCH_JSET_ENTRY_overwrite diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index d96188b9..19d1bb80 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -13,6 +13,7 @@ #include "trace.h" #include <linux/sched/mm.h> +#include <linux/seq_buf.h> static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -580,6 +581,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, bool kick_reclaim = false; BUG_ON(insert->k.u64s > ck->u64s); + BUG_ON(bkey_deleted(&insert->k)); bkey_copy(ck->k, insert); @@ -815,6 +817,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { } +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); @@ -839,6 +853,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->to_text = bch2_btree_key_cache_shrinker_to_text; shrink->batch = 1 << 14; shrink->seeks = 0; shrink->private_data = c; diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 91a51aef..bed2b4b6 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -771,7 +771,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) } static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace) + struct get_locks_fail *f, bool trace, ulong ip) { if (!trace) goto out; @@ -796,7 +796,7 @@ static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, st prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + trace_trans_restart_relock(trans, ip, buf.buf); 
printbuf_exit(&buf); } @@ -806,7 +806,7 @@ out: bch2_trans_verify_locks(trans); } -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) { bch2_trans_verify_locks(trans); @@ -825,7 +825,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) if (path->should_be_locked && (ret = btree_path_get_locks(trans, path, false, &f, BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace); + bch2_trans_relock_fail(trans, path, &f, trace, ip); return ret; } } @@ -838,12 +838,12 @@ out: int bch2_trans_relock(struct btree_trans *trans) { - return __bch2_trans_relock(trans, true); + return __bch2_trans_relock(trans, true, _RET_IP_); } int bch2_trans_relock_notrace(struct btree_trans *trans) { - return __bch2_trans_relock(trans, false); + return __bch2_trans_relock(trans, false, _RET_IP_); } void bch2_trans_unlock(struct btree_trans *trans) diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index a3584773..23d8c62e 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -521,8 +521,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) return false; } -bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) { + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + struct found_btree_node search = { .btree_id = btree, .level = 0, diff --git a/libbcachefs/btree_node_scan.h b/libbcachefs/btree_node_scan.h index 08687b20..66e6f9ed 100644 --- a/libbcachefs/btree_node_scan.h +++ b/libbcachefs/btree_node_scan.h @@ -4,7 +4,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *); bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); void bch2_find_btree_nodes_exit(struct find_btree_nodes *); diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 639ef75b..7fcf248a 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -46,6 +46,9 @@ void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG + if (i->key_cache_flushing) + return; + struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); @@ -337,6 +340,9 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(!bpos_eq(i->k->k.p, path->pos)); BUG_ON(i->cached != path->cached); + BUG_ON(i->cached && + !i->key_cache_already_flushed && + bkey_deleted(&i->k->k));; BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); @@ -1015,9 +1021,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (unlikely(ret)) goto out_reset; - if (!trans->nr_updates && - !trans->journal_entries.u64s && - !trans->accounting.u64s) + if (!bch2_trans_has_updates(trans)) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); diff --git a/libbcachefs/btree_types.h 
b/libbcachefs/btree_types.h index 112170fd..76adf756 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -422,14 +422,16 @@ struct btree_insert_entry { u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; - u8 level:4; + u8 level:3; bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; bool key_cache_already_flushed:1; + bool key_cache_flushing:1; /* - * @old_k may be a key from the journal; @old_btree_u64s always refers - * to the size of the key being overwritten in the btree: + * @old_k may be a key from the journal or the key cache; + * @old_btree_u64s always refers to the size of the key being + * overwritten in the btree: */ u8 old_btree_u64s; btree_path_idx_t path; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 192c1e5e..5d9e0237 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -325,47 +325,11 @@ err: return ret; } -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_insert_entry *i, - enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bkey k; - int ret; - - btree_path_idx_t path_idx = - bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path_idx, 0); - if (ret) - goto out; - - struct btree_path *btree_path = trans->paths + path_idx; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. - */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_norun; - - btree_path_set_should_be_locked(trans, btree_path); - ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); -out: - bch2_path_put(trans, path_idx, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) +static inline struct btree_insert_entry * +__btree_trans_update_by_path(struct btree_trans *trans, + btree_path_idx_t path_idx, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; @@ -436,6 +400,58 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, __btree_path_get(trans, trans->paths + i->path, true); trace_update_by_path(trans, path, i, overwrite); + return i; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_insert_entry *i, + enum btree_iter_update_trigger_flags flags, + unsigned long ip) +{ + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_intent, _THIS_IP_); + int ret = bch2_btree_path_traverse(trans, path_idx, 0); + if (ret) + goto out; + + struct btree_path *btree_path = trans->paths + path_idx; + + btree_path_set_should_be_locked(trans, btree_path); +#if 0 + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. 
+ */ + struct bkey k; + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; +#endif + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_norun; + + struct bkey old_k = i->old_k; + const struct bch_val *old_v = i->old_v; + + i = __btree_trans_update_by_path(trans, path_idx, i->k, flags, _THIS_IP_); + + i->old_k = old_k; + i->old_v = old_v; + i->key_cache_flushing = true; +out: + bch2_path_put(trans, path_idx, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) +{ + struct btree_insert_entry *i = __btree_trans_update_by_path(trans, path_idx, k, flags, ip); /* * If a key is present in the key cache, it must also exist in the @@ -444,10 +460,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && !i->old_btree_u64s) - return flush_new_cached_update(trans, i, flags, ip); - - return 0; + return i->cached && (!i->old_btree_u64s || bkey_deleted(&k->k)) + ? flush_new_cached_update(trans, i, flags, ip) + : 0; } static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, @@ -566,7 +581,7 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, if (buf->u64s) memcpy(n, btree_trans_subbuf_base(trans, buf), - buf->size * sizeof(u64)); + buf->u64s * sizeof(u64)); buf->base = (u64 *) n - (u64 *) trans->mem; buf->size = new_size; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index e4b6e7d5..2c6f9b44 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -184,8 +184,7 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); -int bch2_btree_write_buffer_insert_err(struct btree_trans *, - enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, @@ -196,7 +195,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); dump_stack(); return ret; } @@ -272,6 +271,13 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +static inline bool bch2_trans_has_updates(struct btree_trans *trans) +{ + return trans->nr_updates || + trans->journal_entries.u64s || + trans->accounting.u64s; +} + static inline void bch2_trans_reset_updates(struct btree_trans *trans) { trans_for_each_update(trans, i) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d9ac09fa..8e3d3db2 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -217,7 +217,7 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - trace_and_count(c, btree_node_free, trans, b); + trace_btree_node(c, b, btree_node_free); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -406,7 +406,7 @@ static struct btree 
*bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_and_count(c, btree_node_alloc, trans, b); + trace_btree_node(c, b, btree_node_alloc); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -1278,10 +1278,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, do { ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - + if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); - } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } while (1); } if (ret) { @@ -1330,7 +1331,7 @@ static int bch2_btree_set_root(struct btree_update *as, { struct bch_fs *c = as->c; - trace_and_count(c, btree_node_set_root, trans, b); + trace_btree_node(c, b, btree_node_set_root); struct btree *old = btree_node_root(c, b); @@ -1640,7 +1641,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, trans, b); + trace_btree_node(c, b, btree_node_split); n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1702,7 +1703,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, goto err; } } else { - trace_and_count(c, btree_node_compact, trans, b); + trace_btree_node(c, b, btree_node_compact); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -2118,7 +2119,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, as->node_start = prev->data->min_key; as->node_end = next->data->max_key; - trace_and_count(c, btree_node_merge, trans, b); + trace_btree_node(c, b, btree_node_merge); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2250,8 +2251,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); - trace_and_count(c, btree_node_rewrite, trans, b); - if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); @@ -2262,6 +2261,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + trace_btree_node(c, b, btree_node_rewrite); + bch2_btree_interior_update_will_free_node(as, b); bch2_btree_update_get_open_buckets(as, n); diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 21b5c03d..4b095235 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -267,10 +267,9 @@ out: BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } -int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, +int bch2_btree_write_buffer_insert_err(struct bch_fs *c, enum btree_id btree, struct bkey_i *k) { - struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; prt_printf(&buf, "attempting to do write buffer update on non wb btree="); @@ -332,7 +331,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); goto err; } diff --git a/libbcachefs/btree_write_buffer.h 
b/libbcachefs/btree_write_buffer.h index 05f56fd1..c351d21a 100644 --- a/libbcachefs/btree_write_buffer.h +++ b/libbcachefs/btree_write_buffer.h @@ -89,6 +89,12 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(c, btree, k); + dump_stack(); + return ret; + } + EBUG_ON(!dst->seq); return k->k.type == KEY_TYPE_accounting diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index e848e210..3968f3be 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -783,6 +783,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); + CLASS(printbuf, buf)(); + buf.atomic++; + guard(rcu)(); unsigned nr_replicas = 0, i; @@ -794,7 +797,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) struct bch_dev_usage usage; bch2_dev_usage_read_fast(ca, &usage); - if (!dev_buckets_free(ca, usage, m->op.watermark)) + u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark); + + prt_printf(&buf, "%s=%llu ", ca->name, nr_free); + + if (!nr_free) continue; nr_replicas += ca->mi.durability; @@ -802,8 +809,10 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) break; } - if (!nr_replicas) + if (!nr_replicas) { + trace_data_update_done_no_rw_devs(c, buf.buf); return bch_err_throw(c, data_update_done_no_rw_devs); + } if (nr_replicas < m->op.nr_replicas) return bch_err_throw(c, insufficient_devices); return 0; diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 308de4b2..ccbb0127 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -18,9 +18,12 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, { *out_cf = (struct qstr) QSTR_INIT(NULL, 0); -#if IS_ENABLED(CONFIG_UNICODE) + int ret = bch2_fs_casefold_enabled(trans->c); + if (ret) + return ret; + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); - int ret = PTR_ERR_OR_ZERO(buf); + ret = PTR_ERR_OR_ZERO(buf); if (ret) return ret; @@ -30,9 +33,6 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, *out_cf = (struct qstr) QSTR_INIT(buf, ret); return 0; -#else - return bch_err_throw(trans->c, no_casefolding_without_utf8); -#endif } static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) @@ -252,7 +252,10 @@ int bch2_dirent_init_name(struct bch_fs *c, offsetof(struct bch_dirent, d_name) - name->len); } else { -#if IS_ENABLED(CONFIG_UNICODE) + int ret = bch2_fs_casefold_enabled(c); + if (ret) + return ret; + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; @@ -278,9 +281,6 @@ int bch2_dirent_init_name(struct bch_fs *c, dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); -#else - return bch_err_throw(c, no_casefolding_without_utf8); -#endif } unsigned u64s = dirent_val_u64s(name->len, cf_len); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 543dbba9..687c3ba9 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1683,7 +1683,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, return ERR_PTR(ret); if (test_bit(BCH_FS_going_ro, &c->flags)) { - h = ERR_PTR(-BCH_ERR_erofs_no_writes); + h = ERR_PTR(bch_err_throw(c, erofs_no_writes)); goto err; } @@ -1702,7 +1702,7 
@@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); if (!h) { - h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + h = ERR_PTR(bch_err_throw(c, ENOMEM_stripe_head_alloc)); goto err; } found: diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 3118449d..d27b94a6 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -218,6 +218,7 @@ x(EINVAL, option_negative) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EOPNOTSUPP, no_casefolding_without_utf8) \ + x(EOPNOTSUPP, casefolding_disabled) \ x(EOPNOTSUPP, casefold_opt_is_dir_only) \ x(EOPNOTSUPP, unsupported_fsx_flag) \ x(EOPNOTSUPP, unsupported_fa_flag) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index b2a6c041..a9a9fe19 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -401,7 +401,8 @@ int bch2_fsck_err_opt(struct bch_fs *c, if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) flags |= fsck_flags_extra[err]; - if (test_bit(BCH_FS_in_fsck, &c->flags)) { + if (test_bit(BCH_FS_in_fsck, &c->flags) || + test_bit(BCH_FS_in_recovery, &c->flags)) { if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) return bch_err_throw(c, fsck_repair_unimplemented); @@ -472,10 +473,13 @@ int __bch2_fsck_err(struct bch_fs *c, !trans && bch2_current_has_btree_trans(c)); - if (test_bit(err, c->sb.errors_silent)) - return flags & FSCK_CAN_FIX + if ((flags & FSCK_ERR_SILENT) || + test_bit(err, c->sb.errors_silent)) { + ret = flags & FSCK_CAN_FIX ? bch_err_throw(c, fsck_fix) : bch_err_throw(c, fsck_ignore); + goto err; + } printbuf_indent_add_nextline(out, 2); @@ -620,14 +624,14 @@ print: if (s) s->ret = ret; - +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: if (trans && !(flags & FSCK_ERR_NO_LOG) && ret == -BCH_ERR_fsck_fix) ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: + /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 66bacdd4..dad48d44 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -145,7 +145,7 @@ static int readpage_bio_extend(struct btree_trans *trans, BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); } return bch2_trans_relock(trans); @@ -311,7 +311,7 @@ void bch2_readahead(struct readahead_control *ractl) readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); @@ -354,7 +354,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_private = &done; rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); @@ -639,8 +639,8 @@ do_io: atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); + 
bio_add_folio_nofail(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index df42d58d..3b0783f1 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -722,7 +722,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; -#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(vdir)) { /* * Do not cache a negative dentry in casefolded directories @@ -737,7 +736,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, */ return NULL; } -#endif return d_splice_alias(&inode->v, dentry); } @@ -1694,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, s.mask = map_defined(bch_flags_to_xflags); s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); - if (fa->fsx_xflags) - return bch_err_throw(c, unsupported_fsx_flag); + if (fa->fsx_xflags) { + ret = bch_err_throw(c, unsupported_fsx_flag); + goto err; + } - if (fa->fsx_projid >= U32_MAX) - return bch_err_throw(c, projid_too_big); + if (fa->fsx_projid >= U32_MAX) { + ret = bch_err_throw(c, projid_too_big); + goto err; + } /* * inode fields accessible via the xattr interface are stored with a +1 @@ -1720,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, fa->flags &= ~FS_CASEFOLD_FL; s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); - if (fa->flags) - return bch_err_throw(c, unsupported_fa_flag); + if (fa->flags) { + ret = bch_err_throw(c, unsupported_fa_flag); + goto err; + } } mutex_lock(&inode->ei_update_lock); @@ -1732,7 +1736,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; +err: + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { @@ -2565,9 +2570,10 @@ got_sb: sb->s_shrink->seeks = 0; #if IS_ENABLED(CONFIG_UNICODE) - sb->s_encoding = c->cf_encoding; -#endif + if (!bch2_fs_casefold_enabled(c)) + sb->s_encoding = c->cf_encoding; generic_set_sb_d_ops(sb); +#endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 4ceb28a6..1ceca63c 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -12,6 +12,7 @@ #include "fs.h" #include "fsck.h" #include "inode.h" +#include "io_misc.h" #include "keylist.h" #include "namei.h" #include "recovery_passes.h" @@ -1500,6 +1501,10 @@ static int check_key_has_inode(struct btree_trans *trans, SPOS(k.k->p.inode, 0, k.k->p.snapshot), POS(k.k->p.inode, U64_MAX), 0, k2, ret) { + if (k.k->type == KEY_TYPE_error || + k.k->type == KEY_TYPE_hash_whiteout) + continue; + nr_keys++; if (nr_keys <= 10) { bch2_bkey_val_to_text(&buf, c, k2); @@ -1512,9 +1517,11 @@ static int check_key_has_inode(struct btree_trans *trans, if (ret) goto err; + unsigned reconstruct_limit = iter->btree_id == BTREE_ID_extents ? 
3 : 0; + if (nr_keys > 100) prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); - else if (nr_keys > 10) + else if (nr_keys > reconstruct_limit) prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); if (!have_inode) { @@ -1572,6 +1579,44 @@ reconstruct: goto out; } +static int maybe_reconstruct_inum_btree(struct btree_trans *trans, + u64 inum, u32 snapshot, + enum btree_id btree) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, iter, btree, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret <= 0) + return ret; + + if (fsck_err(trans, missing_inode_with_contents, + "inode %llu:%u type %s missing, but contents found: reconstruct?", + inum, snapshot, + btree == BTREE_ID_extents ? "reg" : "dir")) + return reconstruct_inode(trans, btree, snapshot, inum) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(trans->c, transaction_restart_commit); +fsck_err: + return ret; +} + +static int maybe_reconstruct_inum(struct btree_trans *trans, + u64 inum, u32 snapshot) +{ + return maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_extents) ?: + maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_dirents); +} + static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1919,33 +1964,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent type past end of inode %llu:%u, i_size %llu\n%s", i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); - ret = PTR_ERR_OR_ZERO(whiteout); - if (ret) - goto err; - - bkey_init(&whiteout->k); - whiteout->k.p = SPOS(k.k->p.inode, - last_block, - i->inode.bi_snapshot); - bch2_key_resize(&whiteout->k, - min(KEY_SIZE_MAX & (~0 << c->block_bits), - U64_MAX - whiteout->k.p.offset)); - - - /* - * Need a normal (not BTREE_ITER_all_snapshots) - * iterator, if we're deleting in a different - * snapshot and need to emit a whiteout - */ - struct btree_iter iter2; - bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, - bkey_start_pos(&whiteout->k), - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_trans_update(trans, &iter2, whiteout, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter2); + ret = bch2_fpunch_snapshot(trans, + SPOS(i->inode.bi_inum, + last_block, + i->inode.bi_snapshot), + POS(i->inode.bi_inum, U64_MAX)); if (ret) goto err; @@ -2302,9 +2325,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; -#if IS_ENABLED(CONFIG_UNICODE) hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? 
c->cf_encoding : NULL; -#endif ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k, need_second_pass); @@ -2368,6 +2389,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; + if (!target->inodes.nr) { + ret = maybe_reconstruct_inum(trans, le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); + if (ret) + return ret; + } + if (fsck_err_on(!target->inodes.nr, trans, dirent_to_missing_inode, "dirent points to missing inode:\n%s", @@ -2811,7 +2839,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) - break; + goto out; ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 95f3c0d4..307fb0c9 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -38,7 +38,7 @@ static const char * const bch2_inode_flag_strs[] = { #undef x static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); -static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); +static int may_delete_deleted_inum(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; @@ -1018,6 +1018,7 @@ int bch2_inode_create(struct btree_trans *trans, u64 start = le64_to_cpu(cursor->v.idx); u64 pos = start; + u64 gen = 0; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| @@ -1030,6 +1031,12 @@ again: if (pos < iter->pos.offset) goto found_slot; + if (bch2_snapshot_is_ancestor(trans->c, snapshot, k.k->p.snapshot) && + k.k->type == KEY_TYPE_inode_generation) { + gen = le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + goto found_slot; + } + /* * We don't need to iterate over keys in every snapshot once * we've found just one: @@ -1064,7 +1071,7 @@ found_slot: } inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + inode_u->bi_generation = max(gen, le64_to_cpu(cursor->v.gen)); cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -1128,10 +1135,11 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = {}; struct bkey_s_c k; + struct bch_inode_unpacked inode; u32 snapshot; int ret; - ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); + ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum, &inode)); if (ret) goto err2; @@ -1143,9 +1151,10 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) * XXX: the dirent code ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); + ret = (!S_ISDIR(inode.bi_mode) + ? bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) + : bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents)) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs); if (ret) goto err2; retry: @@ -1265,8 +1274,12 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, { struct bch_fs *c = trans->c; -#if IS_ENABLED(CONFIG_UNICODE) - int ret = 0; + int ret = bch2_fs_casefold_enabled(c); + if (ret) { + bch_err_ratelimited(c, "Cannot enable casefolding: %s", bch2_err_str(ret)); + return ret; + } + /* Not supported on individual files. 
*/ if (!S_ISDIR(bi->bi_mode)) return bch_err_throw(c, casefold_opt_is_dir_only); @@ -1289,10 +1302,6 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_fields_set |= BIT(Inode_opt_casefold); return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -#else - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return bch_err_throw(c, no_casefolding_without_utf8); -#endif } static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) @@ -1317,7 +1326,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum SPOS(inum, 0, snapshot), SPOS(inum, U64_MAX, snapshot), 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (ret) goto err; retry: @@ -1355,7 +1364,7 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret ?: -BCH_ERR_transaction_restart_nested; + return ret ?: bch_err_throw(c, transaction_restart_nested); } /* @@ -1398,12 +1407,12 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) } static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode, bool from_deleted_inodes) { struct bch_fs *c = trans->c; struct btree_iter inode_iter; struct bkey_s_c k; - struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; int ret; @@ -1421,11 +1430,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto out; - ret = bch2_inode_unpack(k, &inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto out; - if (S_ISDIR(inode.bi_mode)) { + if (S_ISDIR(inode->bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); if (fsck_err_on(from_deleted_inodes && bch2_err_matches(ret, ENOTEMPTY), @@ -1437,7 +1446,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, goto out; } - ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); + ret = inode->bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", @@ -1446,7 +1455,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto out; - ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) + ret = !(inode->bi_flags & BCH_INODE_has_child_snapshot) ? 
0 : bch_err_throw(c, inode_has_child_snapshot); if (fsck_err_on(from_deleted_inodes && ret, @@ -1465,10 +1474,10 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, if (fsck_err(trans, inode_has_child_snapshots_wrong, "inode has_child_snapshots flag wrong (should be set)\n%s", (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &inode), + bch2_inode_unpacked_to_text(&buf, inode), buf.buf))) { - inode.bi_flags |= BCH_INODE_has_child_snapshot; - ret = __bch2_fsck_write_inode(trans, &inode); + inode->bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, inode); if (ret) goto out; } @@ -1504,12 +1513,13 @@ delete: goto out; } -static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) +static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) { u32 snapshot; return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); + may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), inode, false); } int bch2_delete_dead_inodes(struct bch_fs *c) @@ -1535,7 +1545,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, k.k->p, true); + struct bch_inode_unpacked inode; + ret = may_delete_deleted_inode(trans, k.k->p, &inode, true); if (ret > 0) { bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index bf72b1d2..07023667 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -135,6 +135,33 @@ err_noprint: return ret; } +/* For fsck */ +int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) +{ + u32 restart_count = trans->restart_count; + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_i delete; + + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + start, end, 0, k, + &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bkey_init(&delete.k); + delete.k.p = iter.pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + bch2_extent_trim_atomic(trans, &iter, &delete) ?: + bch2_trans_update(trans, &iter, &delete, 0); + })); + + bch2_disk_reservation_put(c, &disk_res); + return ret ?: trans_was_restarted(trans, restart_count); +} + /* * Returns -BCH_ERR_transacton_restart if we had to drop locks: */ diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h index 9cb44a7c..b93e4d4b 100644 --- a/libbcachefs/io_misc.h +++ b/libbcachefs/io_misc.h @@ -5,6 +5,8 @@ int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, u64, struct bch_io_opts, s64 *, struct write_point_specifier); + +int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index cd184b21..210b6adc 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -37,12 +37,6 @@ module_param_named(read_corrupt_ratio, 
bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -484,9 +478,6 @@ static void get_rbio_extent(struct btree_trans *trans, static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, enum btree_id btree, struct bkey_s_c read_k) { - if (!bch2_poison_extents_on_checksum_error) - return 0; - struct bch_fs *c = trans->c; struct data_update *u = rbio_data_update(rbio); @@ -1232,6 +1223,10 @@ retry_pick: async_object_list_add(c, rbio, rbio, &rbio->list_idx); + /* XXX: also nvme read recovery level */ + if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) + rbio->bio.bi_opf |= REQ_FUA; + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index df71af00..ce534061 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1283,7 +1283,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } return ret; @@ -1376,7 +1376,6 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) return bch_err_throw(c, erofs_filesystem_full); } - unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { @@ -1385,16 +1384,19 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) } /* 1/128th of the device by default: */ - nr = ca->mi.nbuckets >> 7; + unsigned nr = ca->mi.nbuckets >> 7; /* - * clamp journal size to 8192 buckets or 8GB (in sectors), whichever - * is smaller: + * clamp journal size to 8GB, or 32GB with large_journal option: */ + unsigned max_sectors = 1 << 24; + + if (c->opts.large_journal) + max_sectors *= 4; + nr = clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN, - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); + max_sectors / ca->mi.bucket_size); ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index dd3f3434..3f06c4b2 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1272,6 +1272,28 @@ static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_r printbuf_exit(&buf); } +struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end) +{ + BUG_ON(start > end); + + if (start == end) + return (struct u64_range) {}; + + start = bch2_journal_seq_next_nonblacklisted(c, start); + if (start >= end) + return (struct u64_range) {}; + + struct u64_range missing = { + .start = start, + .end = min(end, bch2_journal_seq_next_blacklisted(c, start)), + }; + + if (missing.start == missing.end) + return (struct u64_range) {}; + + return missing; +} + noinline_for_stack static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) { @@ -1280,6 +1302,7 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e struct genradix_iter radix_iter; struct journal_replay *i, **_i, *prev = NULL; + /* Sequence number we expect to find next, to check for missing entries */ u64 seq = 
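/*
 * Illustrative sketch, not part of the patch: the journal sizing arithmetic
 * from bch2_dev_journal_alloc() above, restated with a made-up helper name
 * (bucket_size is in 512-byte sectors). Example: a device with 2M one-MiB
 * buckets gives nr = 2M >> 7 = 16384, which the cap reduces to
 * (1 << 24) / 2048 = 8192 buckets (8GB), or 32768 buckets (32GB) with the
 * new large_journal option.
 */
static unsigned journal_nr_buckets(u64 nbuckets, unsigned bucket_size,
				   bool large_journal)
{
	unsigned nr = nbuckets >> 7;		/* 1/128th of the device */
	unsigned max_sectors = 1 << 24;		/* 8GB in 512-byte sectors */

	if (large_journal)
		max_sectors *= 4;		/* 32GB */

	return clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN,
		       max_sectors / bucket_size);
}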
start_seq; genradix_for_each(&c->journal_entries, radix_iter, _i) { @@ -1290,43 +1313,31 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e BUG_ON(seq > le64_to_cpu(i->j.seq)); - while (seq < le64_to_cpu(i->j.seq)) { - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - u64 missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - u64 missing_end = seq - 1; + struct u64_range missing; + while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) { printbuf_reset(&buf); prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", - missing_start, missing_end, + missing.start, missing.end - 1, start_seq, end_seq); - prt_printf(&buf, "\nprev at "); if (prev) { + prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq)); bch2_journal_ptrs_to_text(&buf, c, prev); prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf, "(none)"); + } - prt_printf(&buf, "\nnext at "); + prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq)); bch2_journal_ptrs_to_text(&buf, c, i); prt_printf(&buf, ", continue?"); fsck_err(c, journal_entries_missing, "%s", buf.buf); + + seq = missing.end; } prev = i; - seq++; + seq = le64_to_cpu(i->j.seq) + 1; } fsck_err: printbuf_exit(&buf); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 6fa82c40..f53c5c81 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -71,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, struct journal_replay *); +struct u64_range { + u64 start; + u64 end; +}; + +struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); + int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index af4fe416..6361809b 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -103,6 +103,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } +static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r) +{ + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; + + return cmp_int(l->end, r->end); +} + +u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t) + return U64_MAX; + + struct journal_seq_blacklist_table_entry search = { .end = seq }; + int idx = eytzinger0_find_gt(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_end_cmp, + &search); + if (idx < 0) + return U64_MAX; + + return max(seq, t->entries[idx].start); +} + +u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t) + return seq; + + while (true) { + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0 || t->entries[idx].end <= seq) + return seq; + + seq = t->entries[idx].end; + } +} + bool 
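/*
 * Illustrative sketch, not part of the patch: the semantics of the two new
 * helpers above, modelled with a plain array of non-overlapping blacklisted
 * [start, end) ranges sorted by start, instead of the in-tree eytzinger
 * table (struct and helper names here are made up). next_blacklisted(seq)
 * returns the first blacklisted sequence number >= seq (U64_MAX if none);
 * next_nonblacklisted(seq) skips seq forward past any range containing it.
 * bch2_journal_entry_missing_range() combines them so that only genuinely
 * missing, non-blacklisted journal sequence numbers get reported.
 */
struct seq_range { u64 start, end; };

static u64 next_blacklisted(const struct seq_range *r, unsigned nr, u64 seq)
{
	for (unsigned i = 0; i < nr; i++)
		if (r[i].end > seq)
			return max(seq, r[i].start);
	return U64_MAX;
}

static u64 next_nonblacklisted(const struct seq_range *r, unsigned nr, u64 seq)
{
	for (unsigned i = 0; i < nr; i++)
		if (r[i].start <= seq && seq < r[i].end)
			seq = r[i].end;
	return seq;
}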
bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, bool dirty) { diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h index f06942cc..389b789b 100644 --- a/libbcachefs/journal_seq_blacklist.h +++ b/libbcachefs/journal_seq_blacklist.h @@ -11,6 +11,9 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) : 0; } +u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64); +u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64); + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); diff --git a/libbcachefs/namei.c b/libbcachefs/namei.c index c3f87c59..3e2b41ba 100644 --- a/libbcachefs/namei.c +++ b/libbcachefs/namei.c @@ -1027,7 +1027,7 @@ fsck_err: if (repairing_parents) { return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(trans->c, transaction_restart_nested); } return 0; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index b0a76bd6..4a7a6058 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -234,6 +234,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_CASEFOLD, false, \ NULL, "Dirent lookups are casefolded") \ + x(casefold_disabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Disable casefolding filesystem wide") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -338,6 +343,12 @@ enum fsck_err_opts { OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(large_journal, bool, \ + OPT_FS|OPT_MOUNT|OPT_FORMAT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allocate a bigger than normal journal: recovery from unclean "\ + "shutdown will be slower, but more info will be available for debugging")\ x(move_bytes_in_flight, u32, \ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1024, U32_MAX), \ @@ -384,6 +395,11 @@ enum fsck_err_opts { OPT_UINT(0, U64_MAX), \ BCH2_NO_SB_OPT, 0, \ NULL, "Rewind journal") \ + x(journal_rewind_no_extents, bool, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Don't rewind extents when rewinding journal") \ x(recovery_passes, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 1c345b86..73b463c9 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -220,7 +220,7 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans, return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(trans->c, transaction_restart_nested); } #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index d0b7e3a3..974f8bf9 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1177,9 +1177,10 @@ int bch2_fs_initialize(struct bch_fs *c) for_each_member_device(c, ca) { m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); - ca->mi = bch2_mi_to_cpu(m); } + bch2_sb_members_to_cpu(c); + bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index 
c2c18c0a..c09ed2dd 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -313,6 +313,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c, */ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); if (persistent ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) @@ -323,6 +326,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c, (r->passes_ratelimiting & BIT_ULL(pass))) return true; + if (rewind) + return true; + return false; } @@ -337,7 +343,6 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, struct bch_fs_recovery *r = &c->recovery; int ret = 0; - lockdep_assert_held(&c->sb_lock); bch2_printbuf_make_room(out, 1024); @@ -408,10 +413,8 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, { int ret = 0; - scoped_guard(mutex, &c->sb_lock) { - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - + if (recovery_pass_needs_set(c, pass, &flags)) { + guard(mutex)(&c->sb_lock); ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); bch2_write_super(c); } diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index a535abd4..8d8e045b 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -64,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); + + if (REFLINK_P_ERROR(p.v)) + prt_str(out, " error"); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) @@ -164,7 +167,7 @@ static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bk return 0; return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(trans->c, transaction_restart_nested); } static int bch2_indirect_extent_missing_error(struct btree_trans *trans, @@ -239,7 +242,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, if (should_commit) ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(c, transaction_restart_nested); } err: fsck_err: @@ -269,13 +272,12 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - unsigned size = min((u64) k.k->size, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - - reflink_offset); - bch2_key_resize(&iter->k, size); + u64 missing_end = min(k.k->p.offset, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); + BUG_ON(reflink_offset == missing_end); int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - k.k->p.offset, should_commit); + missing_end, should_commit); if (ret) { bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index bb1eddd6..dd4ee466 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -7,6 +7,7 @@ enum bch_fsck_flags { FSCK_CAN_IGNORE = BIT(1), FSCK_AUTOFIX = BIT(2), FSCK_ERR_NO_LOG = BIT(3), + FSCK_ERR_SILENT = BIT(4), }; #define BCH_SB_ERRS() \ @@ -157,6 +158,7 @@ enum bch_fsck_flags { x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_removed_device, 322, 0) \ x(ptr_to_duplicate_device, 143, 0) \ 
x(ptr_after_last_bucket, 144, 0) \ x(ptr_before_first_bucket, 145, 0) \ @@ -290,6 +292,7 @@ enum bch_fsck_flags { x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ x(inode_bi_parent_nonzero, 251, 0) \ + x(missing_inode_with_contents, 321, FSCK_AUTOFIX) \ x(dirent_to_missing_parent_subvol, 252, 0) \ x(dirent_not_visible_in_parent_subvol, 253, 0) \ x(subvol_fs_path_parent_wrong, 254, 0) \ @@ -301,7 +304,7 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ @@ -314,7 +317,7 @@ enum bch_fsck_flags { x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ x(accounting_to_invalid_device, 289, 0) \ - x(invalid_btree_id, 274, 0) \ + x(invalid_btree_id, 274, FSCK_AUTOFIX) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ @@ -331,7 +334,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 321, 0) + x(MAX, 323, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index 6245e342..f2abe92c 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -15,10 +15,15 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); - prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); + bool removed = test_bit(dev, c->devs_removed.d); + + prt_printf(&buf, "pointer to %s device %u in key\n", + removed ? "removed" : "nonexistent", dev); bch2_bkey_val_to_text(&buf, c, k); - bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + bool print = removed + ? bch2_count_fsck_err(c, ptr_to_removed_device, &buf) + : bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); int ret = bch2_run_explicit_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_allocations, 0); @@ -32,7 +37,9 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + bch2_fs_inconsistent(c, "pointer to %s device %u", + test_bit(dev, c->devs_removed.d) + ? 
"removed" : "nonexistent", dev); } void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) @@ -413,6 +420,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) } } +void bch2_sb_members_to_cpu(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + ca->mi = bch2_mi_to_cpu(&m); + } + + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(c->disk_sb.sb, members_v2); + if (mi2) + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member m = members_v2_get(mi2, i); + bool removed = uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + mod_bit(i, c->devs_removed.d, removed); + } +} + void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index 5dcc2017..0d363a1c 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -365,6 +365,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) } void bch2_sb_members_from_cpu(struct bch_fs *); +void bch2_sb_members_to_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 38aeaa12..4c43d2a2 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -871,7 +871,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + if (k.k->type == KEY_TYPE_snapshot_tree && + le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { tree_id = k.k->p.offset; break; } @@ -899,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + if (k.k->type == KEY_TYPE_subvolume && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { snapshot->v.subvol = cpu_to_le32(k.k->p.offset); SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); break; diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index 6766bf67..6dcb118b 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -128,7 +128,7 @@ static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; + return s ? 
s->children[0] : bch_err_throw(c, invalid_snapshot_node); } static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c index 3e9f5922..d39fd426 100644 --- a/libbcachefs/str_hash.c +++ b/libbcachefs/str_hash.c @@ -204,7 +204,7 @@ int bch2_repair_inode_hash_info(struct btree_trans *trans, } ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(c, transaction_restart_nested); err: fsck_err: printbuf_exit(&buf); @@ -292,7 +292,7 @@ int bch2_str_hash_repair_key(struct btree_trans *trans, BTREE_UPDATE_internal_snapshot_node) ?: bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); } else { duplicate_entries: ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); @@ -326,7 +326,7 @@ duplicate_entries: } ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); } out: fsck_err: diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index a49376df..353a9278 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -48,9 +48,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) struct bch_hash_info info = { .inum_snapshot = bi->bi_snapshot, .type = INODE_STR_HASH(bi), -#if IS_ENABLED(CONFIG_UNICODE) .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, -#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -177,7 +175,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); + return bkey_s_c_err(ret ?: bch_err_throw(trans->c, ENOENT_str_hash_lookup)); } static __always_inline struct bkey_s_c @@ -219,7 +217,7 @@ bch2_hash_hole(struct btree_trans *trans, return 0; bch2_trans_iter_exit(trans, iter); - return ret ?: -BCH_ERR_ENOSPC_str_hash_create; + return ret ?: bch_err_throw(trans->c, ENOSPC_str_hash_create); } static __always_inline diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 02058744..353df662 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -70,7 +70,7 @@ static int check_subvol(struct btree_trans *trans, if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ret = bch2_subvolume_delete(trans, iter->pos.offset); bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; + return ret ?: bch_err_throw(c, transaction_restart_nested); } if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && @@ -310,7 +310,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) bch2_trans_iter_exit(trans, &iter); return bkey_err(k) ?: k.k && k.k->p.inode == subvol - ? -BCH_ERR_ENOTEMPTY_subvol_not_empty + ? 
bch_err_throw(trans->c, ENOTEMPTY_subvol_not_empty) : 0; } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 6c2e1d64..85e460d1 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -632,10 +632,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); } - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); - ca->mi = bch2_mi_to_cpu(&m); - } + bch2_sb_members_to_cpu(c); } static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) diff --git a/libbcachefs/super.c b/libbcachefs/super.c index b2fcae49..6980cd5b 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1025,15 +1025,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, } #if IS_ENABLED(CONFIG_UNICODE) - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; + if (!bch2_fs_casefold_enabled(c)) { + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } } #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { @@ -1160,7 +1162,7 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); - if (IS_ENABLED(CONFIG_UNICODE)) + if (c->cf_encoding) bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 05848375..50cf5165 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -44,6 +44,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> +#include <linux/string_choices.h> #include <linux/sched/clock.h> #include "util.h" @@ -156,6 +157,7 @@ write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); +__sysfs_attribute(read_fua_test, 0400); read_attribute(uuid); read_attribute(minor); @@ -303,6 +305,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); } +static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bio *bio = NULL; + void *buf = NULL; + unsigned bs = c->opts.block_size, iters; + u64 end, test_duration = NSEC_PER_SEC * 2; + struct bch2_time_stats stats_nofua, stats_fua, stats_random; + int ret = 0; + + bch2_time_stats_init_no_pcpu(&stats_nofua); + bch2_time_stats_init_no_pcpu(&stats_fua); + bch2_time_stats_init_no_pcpu(&stats_random); + + if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { + prt_str(out, "offline\n"); + return 0; + } + + struct block_device *bdev = ca->disk_sb.bdev; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, 
GFP_KERNEL); + if (!buf) + goto err; + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_nofua, submit_time); + + if (ret) + goto err; + } + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_fua, submit_time); + + if (ret) + goto err; + } + + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_random, submit_time); + + if (ret) + goto err; + } + + u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); + u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); + u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); + + u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); + u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); + u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 12); + prt_printf(out, "This test must be run on an idle drive for accurate results\n"); + prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); + prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); + prt_newline(out); + prt_printf(out, "ns:\tlatency\rstddev\r\n"); + prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); + prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); + prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); + + bool read_cache = ns_nofua * 2 < ns_rand; + bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; + + if (!read_cache) + prt_str(out, "reads don't appear to be cached - safe\n"); + else if (!fua_cached) + prt_str(out, "fua reads don't appear to be cached - safe\n"); + else + prt_str(out, "fua reads appear to be cached - unsafe\n"); +err: + kfree(buf); + kfree(bio); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); + bch_err_fn(c, ret); + return ret; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -840,6 +952,9 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + if (attr == &sysfs_read_fua_test) + return bch2_read_fua_test(out, ca); + int opt_id = bch2_opt_lookup(attr->name); if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); @@ -902,6 +1017,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, + &sysfs_read_fua_test, + /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 9c5a9c55..9324ef32 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -92,58 +92,6 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, 
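/*
 * Illustrative sketch, not part of the patch: the classification at the end
 * of bch2_read_fua_test() above, restated on its own (helper name made up).
 * Mean latencies are in nanoseconds. Sequential (nofua) reads running much
 * faster than random reads suggest the drive is serving them from a cache;
 * if FUA reads are then not close to random-read latency, FUA is probably
 * being satisfied from that cache as well, which the test reports as unsafe.
 */
static bool fua_reads_look_cached(u64 ns_nofua, u64 ns_fua, u64 ns_rand)
{
	bool read_cache = ns_nofua * 2 < ns_rand;
	bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;

	return fua_cached;
}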
__entry->trans_fn, __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node_nofs, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %s %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -527,9 +475,9 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node_nofs, btree_cache_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) +DEFINE_EVENT(fs_str, btree_cache_reap, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, @@ -554,39 +502,24 @@ DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, /* Btree */ -DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_read, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(btree_node_write, - TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), - TP_ARGS(b, bytes, sectors), - - TP_STRUCT__entry( - __field(enum btree_node_type, type) - __field(unsigned, bytes ) - __field(unsigned, sectors ) - ), - - TP_fast_assign( - __entry->type = btree_node_type(b); - __entry->bytes = bytes; - __entry->sectors = sectors; - ), - - TP_printk("bkey type %u bytes %u sectors %u", - __entry->type , __entry->bytes, __entry->sectors) +DEFINE_EVENT(fs_str, btree_node_write, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_free, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(btree_reserve_get_fail, @@ -617,29 +550,29 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_set_root, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, 
btree_node_merge, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_rewrite, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_merge, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_compact, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_split, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(btree_path_relock_fail, @@ -1397,6 +1330,11 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, data_update_done_no_rw_devs, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, io_move_pred, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/libbcachefs/util.c b/libbcachefs/util.c index df9a6071..7a4436fd 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -299,17 +299,12 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne if (ret) return ret; - if (!down_read_trylock(&task->signal->exec_update_lock)) - return -1; - do { nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); stack->nr = nr_entries; - up_read(&task->signal->exec_update_lock); - return ret; #else return 0; @@ -617,17 +612,10 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro void bch2_bio_map(struct bio *bio, void *base, size_t size) { - while (size) { - struct page *page = is_vmalloc_addr(base) - ? vmalloc_to_page(base) - : virt_to_page(base); - unsigned offset = offset_in_page(base); - unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - - BUG_ON(!bio_add_page(bio, page, len, offset)); - size -= len; - base += len; - } + if (is_vmalloc_addr(base)) + bio_add_vmalloc(bio, base, size); + else + bio_add_virt_nofail(bio, base, size); } int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) |
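/*
 * Illustrative sketch, not part of the patch: a minimal caller of the
 * rewritten bch2_bio_map() above, following the same bio_kmalloc() +
 * bio_init() + submit_bio_wait() pattern used by the read_fua_test code
 * earlier in this diff. `buf` is assumed to be a kmalloc'd buffer of `bs`
 * bytes, and the function name is hypothetical.
 */
static int read_one_block(struct block_device *bdev, void *buf, unsigned bs)
{
	struct bio *bio = bio_kmalloc(1, GFP_KERNEL);
	int ret;

	if (!bio)
		return -ENOMEM;

	bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
	bch2_bio_map(bio, buf, bs);

	ret = submit_bio_wait(bio);
	kfree(bio);
	return ret;
}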