summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2024-04-03 16:47:22 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2024-04-03 17:34:02 -0400
commit5639fb38cabaa326b8b664d874a46509d4a60bf2 (patch)
tree3f3aa73ebc9505bede820ce02d7e8a9bad7dfa08
parent9f4ed5ce05888b62d1e4417323c553e5d06f4abf (diff)
Update bcachefs sources to 09d4c2acbf4c bcachefs: reconstruct_inode()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--Makefile.compiler2
-rw-r--r--c_src/cmd_migrate.c6
-rw-r--r--c_src/libbcachefs.h2
-rw-r--r--c_src/tools-util.c6
-rw-r--r--include/linux/blk_types.h23
-rw-r--r--include/linux/blkdev.h36
-rw-r--r--include/linux/poison.h3
-rw-r--r--libbcachefs/alloc_background.c47
-rw-r--r--libbcachefs/alloc_foreground.c4
-rw-r--r--libbcachefs/alloc_types.h3
-rw-r--r--libbcachefs/backpointers.c173
-rw-r--r--libbcachefs/bcachefs.h3
-rw-r--r--libbcachefs/bcachefs_format.h1
-rw-r--r--libbcachefs/btree_io.c14
-rw-r--r--libbcachefs/btree_trans_commit.c3
-rw-r--r--libbcachefs/btree_update_interior.c49
-rw-r--r--libbcachefs/btree_update_interior.h24
-rw-r--r--libbcachefs/buckets.h1
-rw-r--r--libbcachefs/data_update.c3
-rw-r--r--libbcachefs/error.c4
-rw-r--r--libbcachefs/eytzinger.c8
-rw-r--r--libbcachefs/fsck.c244
-rw-r--r--libbcachefs/logged_ops.c7
-rw-r--r--libbcachefs/opts.c4
-rw-r--r--libbcachefs/opts.h7
-rw-r--r--libbcachefs/recovery.c61
-rw-r--r--libbcachefs/recovery.h2
-rw-r--r--libbcachefs/recovery_passes.c65
-rw-r--r--libbcachefs/recovery_passes.h1
-rw-r--r--libbcachefs/recovery_passes_types.h1
-rw-r--r--libbcachefs/sb-errors_types.h4
-rw-r--r--libbcachefs/snapshot.c175
-rw-r--r--libbcachefs/snapshot.h26
-rw-r--r--libbcachefs/super-io.c29
-rw-r--r--libbcachefs/super.c13
-rw-r--r--libbcachefs/super_types.h2
-rw-r--r--libbcachefs/util.h10
-rw-r--r--linux/blkdev.c25
39 files changed, 876 insertions, 217 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9abb2e20..0370dc87 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-794723fc10c4a1ff28d4b11c436277ba783f47e6
+09d4c2acbf4c864fef0f520bbcba256c9a19102e
diff --git a/Makefile.compiler b/Makefile.compiler
index 8fcb4274..92be0c9a 100644
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -38,7 +38,7 @@ as-option = $(call try-run,\
# Usage: aflags-y += $(call as-instr,instr,option1,option2)
as-instr = $(call try-run,\
- printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+ printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -Wa$(comma)--fatal-warnings -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
# __cc-option
# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c
index e26b3bb8..5ac326dd 100644
--- a/c_src/cmd_migrate.c
+++ b/c_src/cmd_migrate.c
@@ -676,12 +676,12 @@ static int migrate_fs(const char *fs_path,
struct dev_opts dev = dev_opts_default();
dev.path = dev_t_to_path(stat.st_dev);
- dev.handle = bdev_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
+ dev.file = bdev_file_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
- int ret = PTR_ERR_OR_ZERO(dev.handle);
+ int ret = PTR_ERR_OR_ZERO(dev.file);
if (ret < 0)
die("Error opening device to format %s: %s", dev.path, strerror(-ret));
- dev.bdev = dev.handle->bdev;
+ dev.bdev = file_bdev(dev.file);
opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_fd));
diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h
index 739783f0..05720573 100644
--- a/c_src/libbcachefs.h
+++ b/c_src/libbcachefs.h
@@ -52,7 +52,7 @@ static inline struct format_opts format_opts_default()
}
struct dev_opts {
- struct bdev_handle *handle;
+ struct file *file;
struct block_device *bdev;
char *path;
u64 size; /* bytes*/
diff --git a/c_src/tools-util.c b/c_src/tools-util.c
index d8371c24..7da1d689 100644
--- a/c_src/tools-util.c
+++ b/c_src/tools-util.c
@@ -189,13 +189,13 @@ int open_for_format(struct dev_opts *dev, bool force)
const char *fs_type = NULL, *fs_label = NULL;
size_t fs_type_len, fs_label_len;
- dev->handle = bdev_open_by_path(dev->path,
+ dev->file = bdev_file_open_by_path(dev->path,
BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL|BLK_OPEN_BUFFERED,
dev, NULL);
- int ret = PTR_ERR_OR_ZERO(dev->handle);
+ int ret = PTR_ERR_OR_ZERO(dev->file);
if (ret < 0)
die("Error opening device to format %s: %s", dev->path, strerror(-ret));
- dev->bdev = dev->handle->bdev;
+ dev->bdev = file_bdev(dev->file);
if (!(pr = blkid_new_probe()))
die("blkid error 1");
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 80560ab6..3cbf8c9e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -16,6 +16,28 @@ typedef void (bio_end_io_t) (struct bio *);
#define BDEVNAME_SIZE 32
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers */
+#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (specialy hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5))
+
+struct inode {
+ unsigned long i_ino;
+ loff_t i_size;
+ struct super_block *i_sb;
+ blk_mode_t mode;
+};
+
struct request_queue {
struct backing_dev_info *backing_dev_info;
};
@@ -34,6 +56,7 @@ struct block_device {
dev_t bd_dev;
char name[BDEVNAME_SIZE];
struct inode *bd_inode;
+ struct inode __bd_inode;
struct request_queue queue;
void *bd_holder;
struct gendisk * bd_disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 998f5e2c..1d825a70 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -23,27 +23,6 @@ struct user_namespace;
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
-typedef unsigned int __bitwise blk_mode_t;
-
-/* open for reading */
-#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0))
-/* open for writing */
-#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1))
-/* open exclusively (vs other exclusive openers */
-#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2))
-/* opened with O_NDELAY */
-#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
-/* open for "writes" only for ioctls (specialy hack for floppy.c) */
-#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
-
-#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5))
-
-struct inode {
- unsigned long i_ino;
- loff_t i_size;
- struct super_block *i_sb;
-};
-
struct file {
struct inode *f_inode;
};
@@ -89,15 +68,14 @@ struct blk_holder_ops {
void (*mark_dead)(struct block_device *bdev);
};
-struct bdev_handle {
- struct block_device *bdev;
- void *holder;
- blk_mode_t mode;
-};
+static inline struct block_device *file_bdev(struct file *file)
+{
+ return container_of(file->f_inode, struct block_device, __bd_inode);
+}
-void bdev_release(struct bdev_handle *);
-struct bdev_handle *bdev_open_by_path(const char *, blk_mode_t, void *,
- const struct blk_holder_ops *);
+void fput(struct file *);
+struct file *bdev_file_open_by_path(const char *, blk_mode_t, void *,
+ const struct blk_holder_ops *);
int lookup_bdev(const char *path, dev_t *);
struct super_block {
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 27a7dad1..1f0ee245 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -92,4 +92,7 @@
/********** VFS **********/
#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA))
+/********** lib/stackdepot.c **********/
+#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
+
#endif
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 893e38f9..4ff56fa4 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (a->v.dirty_sectors) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "attempting to discard bucket with dirty data\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
- }
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (data_type_is_empty(a->v.data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
}
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
goto out;
}
@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
if (ret)
goto err;
+ BUG_ON(a->v.dirty_sectors);
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
a->v.data_type = alloc_data_type(a->v, a->v.data_type);
@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 214b15c8..a1fc30ad 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index b91b7a46..c2226e94 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index 762c8ddf..114328ac 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
#include "error.h"
#include <linux/mm.h>
@@ -418,6 +419,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ c, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
@@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
@@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "extent for nonexistent device:bucket ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ bch_err(c, "%s", buf.buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
return 0;
- if (!bch2_dev_bucket_exists(c, bucket))
- goto missing;
-
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0);
@@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
- goto missing;
+
+ goto check_existing_bp;
}
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
+ printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ struct bkey_i_backpointer n_bp_k;
+ bkey_backpointer_init(&n_bp_k.k_i);
+ n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ n_bp_k.v = bp;
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index f129fc4f..a31a5f70 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -615,6 +615,7 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
x(may_go_rw) \
x(rw) \
@@ -797,6 +798,7 @@ struct bch_fs {
u64 features;
u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -826,7 +828,6 @@ struct bch_fs {
struct btree_root btree_roots_known[BTREE_ID_NR];
DARRAY(struct btree_root) btree_roots_extra;
struct mutex btree_root_lock;
- unsigned long btrees_lost_data; /* bitmask */
struct btree_cache btree_cache;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index bff8750a..63102992 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -818,6 +818,7 @@ struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
+ __le64 btrees_lost_data;
};
struct bch_sb_field_downgrade_entry {
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 9cd9b657..d7de82ac 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1264,11 +1264,11 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else {
+ } else {
set_btree_node_read_error(b);
- set_bit(b->c.btree_id, &c->btrees_lost_data);
+ bch2_btree_lost_data(c, b->c.btree_id);
}
goto out;
}
@@ -1330,7 +1330,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
- set_bit(b->c.btree_id, &c->btrees_lost_data);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1532,7 +1532,7 @@ fsck_err:
if (ret) {
set_btree_node_read_error(b);
- set_bit(b->c.btree_id, &c->btrees_lost_data);
+ bch2_btree_lost_data(c, b->c.btree_id);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@@ -1669,7 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_fatal_error(c);
set_btree_node_read_error(b);
- set_bit(b->c.btree_id, &c->btrees_lost_data);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
@@ -1866,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work)
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 96669fed..aa9da497 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 8d74dfcb..32397b99 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -26,6 +26,13 @@
#include <linux/random.h>
+const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BCH_WATERMARKS()
+#undef x
+ NULL
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
@@ -303,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
@@ -687,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->mode = BTREE_UPDATE_node;
as->b = b;
set_btree_node_write_blocked(b);
@@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1121,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim)
+ watermark < BCH_WATERMARK_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
ret = drop_locks_do(trans,
@@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->ip_started = _RET_IP_;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
@@ -1217,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -2509,18 +2517,25 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev
bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
}
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+ prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ (void *) as->ip_started,
+ bch2_btree_id_str(as->btree_id),
+ bch2_watermarks[as->watermark],
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
+}
+
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_update *as;
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- (void *) as->ip_started,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index cffb2c67..88dcf5a2 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -12,6 +12,18 @@
int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -39,14 +51,8 @@ struct btree_update {
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_watermark watermark;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
@@ -56,7 +62,7 @@ struct btree_update {
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 6387e039..00aaf4bb 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index b564404d..34731ee0 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
+ list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index f942a394..82a6656c 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
+#include "journal.h"
#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
diff --git a/libbcachefs/eytzinger.c b/libbcachefs/eytzinger.c
index a064ccff..4ce5e957 100644
--- a/libbcachefs/eytzinger.c
+++ b/libbcachefs/eytzinger.c
@@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n)
struct wrapper {
cmp_func_t cmp;
- swap_func_t swap_f;
+ swap_func_t swap;
};
/*
@@ -125,7 +125,7 @@ struct wrapper {
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap_f(a, b, (int)size);
+ ((const struct wrapper *)priv)->swap(a, b, (int)size);
return;
}
@@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_f)
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
swap_func = NULL;
if (!swap_func) {
@@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
{
struct wrapper w = {
.cmp = cmp_func,
- .swap_f = swap_func,
+ .swap = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index aca6fae4..8e201021 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -158,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -169,7 +168,8 @@ err:
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
@@ -184,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
return ret;
@@ -292,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
}
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
@@ -363,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
}
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+ new_inode.bi_size = size;
+ new_inode.bi_inum = inum;
+
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
+
struct snapshots_seen_entry {
u32 id;
u32 equiv;
@@ -1064,6 +1187,11 @@ static int check_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
if (fsck_err_on(ret,
c, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
@@ -1081,7 +1209,7 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
}
-
+do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
@@ -1130,8 +1258,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
}
@@ -1371,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1438,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
@@ -1504,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
i->seen_this_pos = true;
}
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
err:
fsck_err:
@@ -1584,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1782,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
u64 parent_inum;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1790,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
@@ -1798,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- u32 new_parent_subvol;
- ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
- if (ret)
- goto err;
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
ret = PTR_ERR_OR_ZERO(new_dirent);
@@ -1847,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
+ goto err;
- if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = 0;
+ goto err;
+ }
+
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
c, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum,
@@ -1857,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
subvol_root.bi_parent_subvol = parent_subvol;
ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
out:
err:
fsck_err:
@@ -1880,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
struct bpos equiv;
@@ -1919,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ dir->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
@@ -1953,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d);
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
index 9fac838d..b82f8209 100644
--- a/libbcachefs/logged_ops.c
+++ b/libbcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
if (!fn)
return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+
+ return trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 08ea0cfc..e1800c41 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -7,6 +7,7 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = ARRAY_SIZE(_choices), \
.choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 9da66395..1ac4135c 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -362,7 +362,12 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index b2b8fdf2..b76c1615 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -33,6 +33,20 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+{
+ u64 b = BIT_ULL(btree);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+}
+
static bool btree_id_is_alloc(enum btree_id id)
{
switch (id) {
@@ -272,7 +286,8 @@ int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.retain_recovery_info)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -468,8 +483,8 @@ static int read_btree_roots(struct bch_fs *c)
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}
- set_bit(i, &c->btrees_lost_data);
ret = 0;
+ bch2_btree_lost_data(c, i);
}
}
@@ -590,27 +605,14 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
- }
-
- c->opts.retain_recovery_info |= c->opts.norecovery;
- c->opts.nochanges |= c->opts.norecovery;
+ if (c->opts.norecovery)
+ c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
@@ -841,6 +843,7 @@ use_clean:
}
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
@@ -854,15 +857,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -932,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index 3962fd87..4bf818de 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+
int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c
index 6fa095fe..cb501460 100644
--- a/libbcachefs/recovery_passes.c
+++ b/libbcachefs/recovery_passes.c
@@ -17,6 +17,7 @@
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
+#include "super-io.h"
const char * const bch2_recovery_passes[] = {
#define x(_fn, ...) #_fn,
@@ -27,7 +28,7 @@ const char * const bch2_recovery_passes[] = {
static int bch2_check_allocations(struct bch_fs *c)
{
- return bch2_gc(c, true, c->opts.norecovery);
+ return bch2_gc(c, true, false);
}
static int bch2_set_may_go_rw(struct bch_fs *c)
@@ -59,18 +60,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = {
#undef x
};
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
+static const u8 passes_to_stable_map[] = {
#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
BCH_RECOVERY_PASSES()
#undef x
- };
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
+ ret |= BIT_ULL(passes_to_stable_map[i]);
return ret;
}
@@ -113,6 +119,38 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
}
}
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
u64 bch2_fsck_recovery_passes(void)
{
u64 ret = 0;
@@ -127,8 +165,6 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
if (c->recovery_passes_explicit & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
@@ -184,6 +220,10 @@ int bch2_run_recovery_passes(struct bch_fs *c)
int ret = 0;
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
+ break;
+
if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
unsigned pass = c->curr_recovery_pass;
@@ -196,8 +236,13 @@ int bch2_run_recovery_passes(struct bch_fs *c)
c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
}
- c->curr_recovery_pass++;
+
c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+ c->curr_recovery_pass++;
}
return ret;
diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h
index abefa677..99b464e1 100644
--- a/libbcachefs/recovery_passes.h
+++ b/libbcachefs/recovery_passes.h
@@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
int bch2_run_online_recovery_passes(struct bch_fs *);
int bch2_run_recovery_passes(struct bch_fs *);
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
index 840542cf..773aea9a 100644
--- a/libbcachefs/recovery_passes_types.h
+++ b/libbcachefs/recovery_passes_types.h
@@ -32,6 +32,7 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h
index 2fec03a2..d7d60913 100644
--- a/libbcachefs/sb-errors_types.h
+++ b/libbcachefs/sb-errors_types.h
@@ -268,7 +268,9 @@
x(btree_node_bkey_bad_u64s, 260) \
x(btree_node_topology_empty_interior_node, 261) \
x(btree_ptr_v2_min_key_bad, 262) \
- x(btree_root_unreadable_and_scan_found_nothing, 263)
+ x(btree_root_unreadable_and_scan_found_nothing, 263) \
+ x(snapshot_node_missing, 264) \
+ x(dup_backpointer_to_bad_csum_extent, 265)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index 4e074136..0e806f04 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -8,6 +8,7 @@
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -131,7 +132,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
rcu_read_lock();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
goto out;
}
@@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
@@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c)
return ret;
}
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ u32 tree_id;
+ int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ c, snapshot_node_missing,
+ "snapshot node %u from tree %s missing", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
index 331f20fd..b7d2fed3 100644
--- a/libbcachefs/snapshot.h
+++ b/libbcachefs/snapshot.h
@@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 61fc0ce7..e0aa3655 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev_handle))
- bdev_release(sb->bdev_handle);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -712,23 +714,23 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev_handle) &&
- PTR_ERR(sb->bdev_handle) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev_handle))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev_handle)) {
- ret = PTR_ERR(sb->bdev_handle);
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
- sb->bdev = sb->bdev_handle->bdev;
+ sb->bdev = file_bdev(sb->s_bdev_file);
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:");
+ prt_tab(out);
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 6269bec9..ed63018f 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -366,7 +366,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -511,7 +511,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -1018,8 +1019,16 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 0e5a14fc..ec784d97 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -4,7 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
- struct bdev_handle *bdev_handle;
+ struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index b4621b68..b7e7c292 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -793,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
}
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/linux/blkdev.c b/linux/blkdev.c
index b7f58737..54122955 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -162,16 +162,18 @@ sector_t get_capacity(struct gendisk *disk)
return bytes >> 9;
}
-void bdev_release(struct bdev_handle *handle)
+void fput(struct file *file)
{
- fdatasync(handle->bdev->bd_fd);
- close(handle->bdev->bd_fd);
- free(handle->bdev);
- free(handle);
+ struct block_device *bdev = file_bdev(file);
+
+ fdatasync(bdev->bd_fd);
+ close(bdev->bd_fd);
+ free(bdev);
+ free(file);
}
-struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
- void *holder, const struct blk_holder_ops *hop)
+struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hop)
{
int fd, flags = 0;
@@ -204,13 +206,12 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
bdev->bd_disk = &bdev->__bd_disk;
bdev->bd_disk->bdi = &bdev->bd_disk->__bdi;
bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
+ bdev->bd_inode = &bdev->__bd_inode;
- struct bdev_handle *handle = calloc(sizeof(*handle), 1);
- handle->bdev = bdev;
- handle->holder = holder;
- handle->mode = mode;
+ struct file *file = calloc(sizeof(*file), 1);
+ file->f_inode = bdev->bd_inode;
- return handle;
+ return file;
}
int lookup_bdev(const char *path, dev_t *dev)