diff options
Diffstat (limited to 'libbcachefs/btree_update_interior.c')
-rw-r--r-- | libbcachefs/btree_update_interior.c | 203 |
1 files changed, 142 insertions, 61 deletions
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 642213ef..8d74dfcb 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,6 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); + } + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } -#endif + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -646,7 +695,7 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s(): error %s", __func__, bch2_err_str(ret)); + "%s", bch2_err_str(ret)); err: if (as->b) { @@ -1067,13 +1116,18 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < c->journal.watermark) { + if (watermark < c->journal.watermark) { struct journal_res res = { 0 }; + unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && + watermark != BCH_WATERMARK_reclaim) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, - watermark|JOURNAL_RES_GET_CHECK)); + bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_journal_reclaim_would_deadlock; if (ret) return ERR_PTR(ret); } @@ -1117,6 +1171,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init(&as->cl, NULL); as->c = c; as->start_time = start_time; + as->ip_started = _RET_IP_; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = true; as->btree_id = path->btree_id; @@ -1192,7 +1247,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS)) + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_reclaim_would_deadlock) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1373,9 +1429,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); @@ -1434,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1466,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as, __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1481,9 +1543,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1703,7 +1770,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } bch2_btree_insert_keys_interior(as, trans, path, b, keys); @@ -1721,7 +1792,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* @@ -1811,9 +1882,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, { struct bch_fs *c = trans->c; struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, - b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2114,7 +2188,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn(c, ret); + bch_err_fn_ratelimited(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2161,7 +2235,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); } - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } void bch2_do_pending_node_rewrites(struct bch_fs *c) @@ -2173,7 +2247,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) list_del(&a->list); bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } mutex_unlock(&c->pending_node_rewrites_lock); } @@ -2384,7 +2458,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2403,7 +2477,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2430,9 +2504,9 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2441,12 +2515,12 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + as->mode, + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); } @@ -2510,6 +2584,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + if (c->btree_node_rewrite_worker) + destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) destroy_workqueue(c->btree_interior_update_worker); mempool_exit(&c->btree_interior_update_pool); @@ -2534,6 +2610,11 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + c->btree_node_rewrite_worker = + alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); + if (!c->btree_node_rewrite_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; |