summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kent Overstreet <kent.overstreet@linux.dev> 2025-03-25 13:24:57 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2025-03-25 16:37:24 -0400
commit: 8af5c93e488481d58f85364278e6037ce354eccc (patch)
tree: 9022a68ea55211b2cc3bc4aea99b12ff343b7a78
parent: 6657ce2de3cdb25b14fb0183b90366e3e577fb9a (diff)
Update bcachefs sources to 1392e502d48b bcachefs: Add an "ignore unknown" option to bch2_parse_mount_opts()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- .bcachefs_revision | 2
-rw-r--r-- c_src/cmd_format.c | 6
-rw-r--r-- c_src/cmd_fsck.c | 2
-rw-r--r-- c_src/cmd_migrate.c | 7
-rw-r--r-- c_src/libbcachefs.c | 16
-rw-r--r-- c_src/libbcachefs.h | 1
-rw-r--r-- c_src/posix_to_bcachefs.c | 2
-rw-r--r-- include/linux/compiler.h | 1
-rw-r--r-- include/linux/kernel.h | 1
-rw-r--r-- include/linux/kmsan-checks.h | 98
-rw-r--r-- libbcachefs/alloc_background.c | 14
-rw-r--r-- libbcachefs/alloc_foreground.c | 6
-rw-r--r-- libbcachefs/alloc_foreground.h | 2
-rw-r--r-- libbcachefs/backpointers.c | 13
-rw-r--r-- libbcachefs/bkey.h | 1
-rw-r--r-- libbcachefs/btree_io.c | 22
-rw-r--r-- libbcachefs/btree_iter.h | 9
-rw-r--r-- libbcachefs/btree_locking.c | 8
-rw-r--r-- libbcachefs/btree_trans_commit.c | 1
-rw-r--r-- libbcachefs/btree_update.c | 2
-rw-r--r-- libbcachefs/btree_update.h | 2
-rw-r--r-- libbcachefs/btree_update_interior.c | 18
-rw-r--r-- libbcachefs/buckets.c | 66
-rw-r--r-- libbcachefs/checksum.c | 2
-rw-r--r-- libbcachefs/compress.c | 65
-rw-r--r-- libbcachefs/data_update.c | 2
-rw-r--r-- libbcachefs/dirent.c | 51
-rw-r--r-- libbcachefs/dirent.h | 2
-rw-r--r-- libbcachefs/disk_accounting.c | 11
-rw-r--r-- libbcachefs/disk_accounting.h | 26
-rw-r--r-- libbcachefs/disk_accounting_format.h | 90
-rw-r--r-- libbcachefs/ec.c | 16
-rw-r--r-- libbcachefs/ec.h | 2
-rw-r--r-- libbcachefs/errcode.h | 18
-rw-r--r-- libbcachefs/error.c | 2
-rw-r--r-- libbcachefs/extents.c | 44
-rw-r--r-- libbcachefs/fs-io-buffered.c | 2
-rw-r--r-- libbcachefs/fs-ioctl.c | 2
-rw-r--r-- libbcachefs/fs.c | 38
-rw-r--r-- libbcachefs/fsck.c | 235
-rw-r--r-- libbcachefs/inode.c | 11
-rw-r--r-- libbcachefs/inode.h | 1
-rw-r--r-- libbcachefs/io_read.c | 123
-rw-r--r-- libbcachefs/io_read.h | 11
-rw-r--r-- libbcachefs/io_write.c | 297
-rw-r--r-- libbcachefs/io_write.h | 9
-rw-r--r-- libbcachefs/journal.c | 13
-rw-r--r-- libbcachefs/journal_io.c | 2
-rw-r--r-- libbcachefs/journal_reclaim.c | 6
-rw-r--r-- libbcachefs/move.c | 63
-rw-r--r-- libbcachefs/move_types.h | 2
-rw-r--r-- libbcachefs/namei.c (renamed from libbcachefs/fs-common.c) | 180
-rw-r--r-- libbcachefs/namei.h (renamed from libbcachefs/fs-common.h) | 31
-rw-r--r-- libbcachefs/opts.c | 53
-rw-r--r-- libbcachefs/opts.h | 8
-rw-r--r-- libbcachefs/rebalance.c | 6
-rw-r--r-- libbcachefs/recovery.c | 2
-rw-r--r-- libbcachefs/sb-counters_format.h | 1
-rw-r--r-- libbcachefs/sb-errors_format.h | 3
-rw-r--r-- libbcachefs/super.c | 9
-rw-r--r-- libbcachefs/sysfs.c | 116
-rw-r--r-- libbcachefs/time_stats.c | 22
-rw-r--r-- libbcachefs/time_stats.h | 1
-rw-r--r-- libbcachefs/trace.h | 5
-rw-r--r-- libbcachefs/util.h | 4
65 files changed, 1179 insertions, 708 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index e02e3d5c..b81d691d 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-dbe591cee299957e282eb7857edea35050b1d8b5
+e2e7dcddb3660e90a972473bb10de570964754d7
diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c
index 2d900f1e..d8714888 100644
--- a/c_src/cmd_format.c
+++ b/c_src/cmd_format.c
@@ -207,9 +207,8 @@ int cmd_format(int argc, char *argv[])
force = true;
break;
case O_fs_size:
- if (bch2_strtoull_h(optarg, &dev_opts.opts.fs_size))
+ if (bch2_strtoull_h(optarg, &dev_opts.fs_size))
die("invalid filesystem size");
- dev_opts.opts.fs_size_defined = true;
unconsumed_dev_option = true;
break;
case O_superblock_size:
@@ -233,8 +232,7 @@ int cmd_format(int argc, char *argv[])
darray_push(&device_paths, optarg);
dev_opts.path = optarg;
darray_push(&devices, dev_opts);
- dev_opts.opts.fs_size = 0;
- dev_opts.opts.fs_size_defined = 0;
+ dev_opts.fs_size = 0;
unconsumed_dev_option = false;
break;
case O_quiet:
diff --git a/c_src/cmd_fsck.c b/c_src/cmd_fsck.c
index 3a33ca40..859ec731 100644
--- a/c_src/cmd_fsck.c
+++ b/c_src/cmd_fsck.c
@@ -326,7 +326,7 @@ kernel_fsck_err:
} else {
userland_fsck:
printf("Running userspace offline fsck\n");
- ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf);
+ ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf, false);
if (ret)
return ret;
diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c
index a0328ca8..8155a2b3 100644
--- a/c_src/cmd_migrate.c
+++ b/c_src/cmd_migrate.c
@@ -228,9 +228,10 @@ static int migrate_fs(const char *fs_path,
printf("Creating new filesystem on %s in space reserved at %s\n",
dev->path, file_path);
- dev->opts.fs_size = get_size(dev->bdev->bd_fd);
- dev->opts.bucket_size = bch2_pick_bucket_size(fs_opts, devs);
- dev->nbuckets = dev->opts.fs_size / dev->opts.bucket_size;
+ dev->fs_size = get_size(dev->bdev->bd_fd);
+ opt_set(dev->opts, bucket_size, bch2_pick_bucket_size(fs_opts, devs));
+
+ dev->nbuckets = dev->fs_size / dev->opts.bucket_size;
bch2_check_bucket_size(fs_opts, dev);
diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c
index 081a8176..0d19b411 100644
--- a/c_src/libbcachefs.c
+++ b/c_src/libbcachefs.c
@@ -78,13 +78,13 @@ u64 bch2_pick_bucket_size(struct bch_opts opts, dev_opts_list devs)
u64 min_dev_size = BCH_MIN_NR_NBUCKETS * bucket_size;
darray_for_each(devs, i)
- if (i->opts.fs_size < min_dev_size)
+ if (i->fs_size < min_dev_size)
die("cannot format %s, too small (%llu bytes, min %llu)",
- i->path, i->opts.fs_size, min_dev_size);
+ i->path, i->fs_size, min_dev_size);
u64 total_fs_size = 0;
darray_for_each(devs, i)
- total_fs_size += i->opts.fs_size;
+ total_fs_size += i->fs_size;
struct sysinfo info;
si_meminfo(&info);
@@ -181,8 +181,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
/* get device size, if it wasn't specified: */
darray_for_each(devs, i)
- if (!opt_defined(i->opts, fs_size))
- opt_set(i->opts, fs_size, get_size(i->bdev->bd_fd));
+ if (!i->fs_size)
+ i->fs_size = get_size(i->bdev->bd_fd);
/* calculate bucket sizes: */
u64 fs_bucket_size = bch2_pick_bucket_size(fs_opts, devs);
@@ -190,10 +190,10 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
darray_for_each(devs, i)
if (!opt_defined(i->opts, bucket_size))
opt_set(i->opts, bucket_size,
- min(fs_bucket_size, dev_max_bucket_size(i->opts.fs_size)));
+ min(fs_bucket_size, dev_max_bucket_size(i->fs_size)));
darray_for_each(devs, i) {
- i->nbuckets = i->opts.fs_size / i->opts.bucket_size;
+ i->nbuckets = i->fs_size / i->opts.bucket_size;
bch2_check_bucket_size(fs_opts, i);
}
@@ -292,7 +292,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
bch2_sb_members_cpy_v2_v1(&sb);
darray_for_each(devs, i) {
- u64 size_sectors = i->opts.fs_size >> 9;
+ u64 size_sectors = i->fs_size >> 9;
sb.sb->dev_idx = i - devs.data;
diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h
index 141a6e8f..619bbbd5 100644
--- a/c_src/libbcachefs.h
+++ b/c_src/libbcachefs.h
@@ -66,6 +66,7 @@ struct dev_opts {
u64 sb_end;
u64 nbuckets;
+ u64 fs_size;
const char *label; /* make this a bch_opt */
diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c
index 63aa0937..72ea11b8 100644
--- a/c_src/posix_to_bcachefs.c
+++ b/c_src/posix_to_bcachefs.c
@@ -6,8 +6,8 @@
#include "posix_to_bcachefs.h"
#include "libbcachefs/alloc_foreground.h"
#include "libbcachefs/buckets.h"
-#include "libbcachefs/fs-common.h"
#include "libbcachefs/io_write.h"
+#include "libbcachefs/namei.h"
#include "libbcachefs/str_hash.h"
#include "libbcachefs/xattr.h"
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 3ecc3dd1..451c323d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -67,6 +67,7 @@
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
#define fallthrough __attribute__((__fallthrough__))
#define __noreturn __attribute__((__noreturn__))
+#define __no_kmsan_checks
#ifndef __counted_by
#define __counted_by(nr)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2e2406dc..1e0615fe 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -12,6 +12,7 @@
#include <linux/byteorder.h>
#include <linux/compiler.h>
#include <linux/dcache.h>
+#include <linux/kmsan-checks.h>
#include <linux/math.h>
#include <linux/minmax.h>
diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h
new file mode 100644
index 00000000..e1082dc4
--- /dev/null
+++ b/include/linux/kmsan-checks.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN checks to be used for one-off annotations in subsystems.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef _LINUX_KMSAN_CHECKS_H
+#define _LINUX_KMSAN_CHECKS_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_KMSAN
+
+/**
+ * kmsan_poison_memory() - Mark the memory range as uninitialized.
+ * @address: address to start with.
+ * @size: size of buffer to poison.
+ * @flags: GFP flags for allocations done by this function.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * uninitialized. Error reports for this memory will reference the call site of
+ * kmsan_poison_memory() as origin.
+ */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags);
+
+/**
+ * kmsan_unpoison_memory() - Mark the memory range as initialized.
+ * @address: address to start with.
+ * @size: size of buffer to unpoison.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * initialized.
+ */
+void kmsan_unpoison_memory(const void *address, size_t size);
+
+/**
+ * kmsan_check_memory() - Check the memory range for being initialized.
+ * @address: address to start with.
+ * @size: size of buffer to check.
+ *
+ * If any piece of the given range is marked as uninitialized, KMSAN will report
+ * an error.
+ */
+void kmsan_check_memory(const void *address, size_t size);
+
+/**
+ * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace.
+ * @to: destination address in the userspace.
+ * @from: source address in the kernel.
+ * @to_copy: number of bytes to copy.
+ * @left: number of bytes not copied.
+ *
+ * If this is a real userspace data transfer, KMSAN checks the bytes that were
+ * actually copied to ensure there was no information leak. If @to belongs to
+ * the kernel space (which is possible for compat syscalls), KMSAN just copies
+ * the metadata.
+ */
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left);
+
+/**
+ * kmsan_memmove() - Notify KMSAN about a data copy within kernel.
+ * @to: destination address in the kernel.
+ * @from: source address in the kernel.
+ * @size: number of bytes to copy.
+ *
+ * Invoked after non-instrumented version (e.g. implemented using assembly
+ * code) of memmove()/memcpy() is called, in order to copy KMSAN's metadata.
+ */
+void kmsan_memmove(void *to, const void *from, size_t to_copy);
+
+#else
+
+static inline void kmsan_poison_memory(const void *address, size_t size,
+ gfp_t flags)
+{
+}
+static inline void kmsan_unpoison_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_check_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_copy_to_user(void __user *to, const void *from,
+ size_t to_copy, size_t left)
+{
+}
+
+static inline void kmsan_memmove(void *to, const void *from, size_t to_copy)
+{
+}
+
+#endif
+
+#endif /* _LINUX_KMSAN_CHECKS_H */
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 54e0cc37..5fb396be 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
s64 delta_sectors,
s64 delta_fragmented, unsigned flags)
{
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = data_type,
- };
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
- return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ d, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = data_type);
}
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
@@ -837,7 +835,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
if (!ca)
- return -EIO;
+ return -BCH_ERR_trigger_alloc;
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -1031,7 +1029,7 @@ fsck_err:
invalid_bucket:
bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -EIO;
+ ret = -BCH_ERR_trigger_alloc;
goto err;
}
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 1759c15a..0cac6534 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
void bch2_open_bucket_write_error(struct bch_fs *c,
struct open_buckets *obs,
- unsigned dev)
+ unsigned dev, int err)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob);
+ bch2_ec_bucket_cancel(c, ob, err);
}
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
@@ -631,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
struct bch_dev_usage *usage)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index baf5dc16..69ec6a01 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -82,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
}
void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned);
+ struct open_buckets *, unsigned, int);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index c9dfc365..20c497f0 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -50,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
}
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_str(out, " data_type=");
+ bch2_prt_data_type(out, bp.v->data_type);
prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
(u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
bp.v->bucket_len,
@@ -782,7 +784,7 @@ enum alloc_sector_counter {
ALLOC_SECTORS_NR
};
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+static int data_type_to_alloc_counter(enum bch_data_type t)
{
switch (t) {
case BCH_DATA_btree:
@@ -791,9 +793,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
case BCH_DATA_cached:
return ALLOC_cached;
case BCH_DATA_stripe:
+ case BCH_DATA_parity:
return ALLOC_stripe;
default:
- BUG();
+ return -1;
}
}
@@ -844,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (bp.v->bucket_gen != a->gen)
continue;
- sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+ int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
+ if (alloc_counter < 0)
+ continue;
+
+ sectors[alloc_counter] += bp.v->bucket_len;
};
bch2_trans_iter_exit(trans, &iter);
if (ret)
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 054e2d5e..08263290 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
{
return bpos_eq(l.k->p, r.k->p) &&
+ l.k->size == r.k->size &&
bkey_bytes(l.k) == bkey_bytes(r.k) &&
!memcmp(l.v, r.v, bkey_val_bytes(l.k));
}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 6abc9f17..2ba33ffc 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -2117,8 +2117,14 @@ out:
return;
err:
set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+
+ if (!bch2_err_matches(ret, EROFS)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
goto out;
}
@@ -2135,10 +2141,14 @@ static void btree_node_write_endio(struct bio *bio)
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);
- if (ca && bio->bi_status)
- bch_err_dev_ratelimited(ca,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
+ if (ca && bio->bi_status) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
if (bio->bi_status) {
unsigned long flags;
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index b96157f3..8823eec6 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra
}
__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
trans->restarted = err;
trans->last_restarted_ip = ip;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+{
+ btree_trans_restart_foreign_task(trans, err, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index caef65ad..94eb2b73 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = i->trans->locking_wait.task;
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", task ?task->pid : 0);
+ prt_printf(out, "%u ", task ? task->pid : 0);
}
prt_newline(out);
}
@@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
trace_would_deadlock(g, i->trans);
- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ return btree_trans_restart_foreign_task(i->trans,
+ BCH_ERR_transaction_restart_would_deadlock,
+ _THIS_IP_);
} else {
i->trans->lock_must_abort = true;
wake_up_process(i->trans->locking_wait.task);
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index d50dc31d..7d7e52dd 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+ kmsan_check_memory(insert, bkey_bytes(&insert->k));
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index b3e346b5..bd2eb42e 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -512,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 47d8690f..d2e1c043 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index d3e0cf01..67f1e320 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
return 0;
}
+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
+{
+ struct btree_node *b_data = READ_ONCE(b->data);
+
+ return (b_data ? b_data->keys.seq : 0) == seq;
+}
+
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
@@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as)
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- __le64 seq;
-
b = as->old_nodes[i];
- bch2_trans_begin(trans);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- seq = b->data ? b->data->keys.seq : 0;
- six_unlock_read(&b->c.lock);
- bch2_trans_unlock_long(trans);
-
- if (seq == as->old_nodes_seq[i])
+ if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index bb7742cf..e56ef623 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -724,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans,
.replicas.nr_required = 1,
};
- struct disk_accounting_pos acct_compression_key = {
- .type = BCH_DISK_ACCOUNTING_compression,
- };
+ unsigned cur_compression_type = 0;
u64 compression_acct[3] = { 1, 0, 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -760,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans,
acc_replicas_key.replicas.nr_required = 0;
}
- if (acct_compression_key.compression.type &&
- acct_compression_key.compression.type != p.crc.compression_type) {
+ if (cur_compression_type &&
+ cur_compression_type != p.crc.compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
@@ -775,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans,
compression_acct[2] = 0;
}
- acct_compression_key.compression.type = p.crc.compression_type;
+ cur_compression_type = p.crc.compression_type;
if (p.crc.compression_type) {
compression_acct[1] += p.crc.uncompressed_size;
compression_acct[2] += p.crc.compressed_size;
@@ -789,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- struct disk_accounting_pos acc_snapshot_key = {
- .type = BCH_DISK_ACCOUNTING_snapshot,
- .snapshot.id = k.k->p.snapshot,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}
- if (acct_compression_key.compression.type) {
+ if (cur_compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
}
if (level) {
- struct disk_accounting_pos acc_btree_key = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = btree_id,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct disk_accounting_pos acc_inum_key = {
- .type = BCH_DISK_ACCOUNTING_inum,
- .inum.inum = k.k->p.inode,
- };
+
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
*replicas_sectors,
};
- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
return ret;
}
@@ -876,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans,
}
int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta -= s;
+ need_rebalance_sectors_delta[0] -= s;
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta += s;
+ need_rebalance_sectors_delta[0] += s;
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
@@ -893,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
return ret;
}
- if (need_rebalance_sectors_delta) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
- };
- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
- flags & BTREE_TRIGGER_gc);
+ if (need_rebalance_sectors_delta[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ need_rebalance_sectors_delta, rebalance_work);
if (ret)
return ret;
}
@@ -914,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors = k.k->size;
+ s64 sectors[1] = { k.k->size };
if (flags & BTREE_TRIGGER_overwrite)
- sectors = -sectors;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
- };
+ sectors[0] = -sectors[0];
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
}
return 0;
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 7f9e4c59..37266890 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, ")");
WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return -BCH_ERR_recompute_checksum;
}
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 31467f77..85fc9034 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
void *workspace;
- int ret;
+ int ret = 0, ret2;
enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
mempool_t *workspace_pool = &c->compress_workspace[opt];
@@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
else
ret = -BCH_ERR_compression_workspace_not_initialized;
if (ret)
- goto out;
+ goto err;
}
src_data = bio_map_or_bounce(c, src, READ);
@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
switch (crc.compression_type) {
case BCH_COMPRESSION_TYPE_lz4_old:
case BCH_COMPRESSION_TYPE_lz4:
- ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret != dst_len)
- goto err;
+ ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
+ if (ret2 != dst_len)
+ ret = -BCH_ERR_decompress_lz4;
break;
case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
- ret = zlib_inflate(&strm, Z_FINISH);
+ ret2 = zlib_inflate(&strm, Z_FINISH);
mempool_free(workspace, workspace_pool);
- if (ret != Z_STREAM_END)
- goto err;
+ if (ret2 != Z_STREAM_END)
+ ret = -BCH_ERR_decompress_gzip;
break;
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
size_t real_src_len = le32_to_cpup(src_data.b);
- if (real_src_len > src_len - 4)
+ if (real_src_len > src_len - 4) {
+ ret = -BCH_ERR_decompress_zstd_src_len_bad;
goto err;
+ }
workspace = mempool_alloc(workspace_pool, GFP_NOFS);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = zstd_decompress_dctx(ctx,
+ ret2 = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
mempool_free(workspace, workspace_pool);
- if (ret != dst_len)
- goto err;
+ if (ret2 != dst_len)
+ ret = -BCH_ERR_decompress_zstd;
break;
}
default:
BUG();
}
- ret = 0;
+err:
fsck_err:
-out:
bio_unmap_or_unbounce(c, src_data);
return ret;
-err:
- ret = -EIO;
- goto out;
}
int bch2_bio_uncompress_inplace(struct bch_write_op *op,
@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc->compressed_size << 9 > c->opts.encoded_extent_max) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
- "extent too big to decompress");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EIO;
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
+ bch2_write_op_error(op, op->pos.offset,
+ "extent too big to decompress (%u > %u)",
+ crc->uncompressed_size << 9, c->opts.encoded_extent_max);
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
}
data = __bounce_alloc(c, dst_len, WRITE);
- if (__bio_uncompress(c, bio, data.b, *crc)) {
- if (!c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
- "decompression error");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- ret = -EIO;
+ ret = __bio_uncompress(c, bio, data.b, *crc);
+
+ if (c->opts.no_data_io)
+ ret = 0;
+
+ if (ret) {
+ bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
goto err;
}
@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -EIO;
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 08bb7f30..0ec273da 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -354,7 +354,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
- ret = -EIO;
+ ret = -BCH_ERR_invalid_bkey;
goto out;
}
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index f4c283d1..d7f9f793 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -729,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
return ret < 0 ? ret : 0;
}
+
+/* fsck */
+
+/*
+ * Find the first inode key with inode number @inode_nr, in any snapshot, and
+ * unpack it into @inode.
+ *
+ * Returns 0 on success, the error from bch2_inode_unpack() if unpacking fails,
+ * -BCH_ERR_ENOENT_inode if the scan finishes cleanly without finding an inode
+ * key, or the error left in @ret by the iteration itself.
+ */
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+			      struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+				     BTREE_ITER_all_snapshots, k, ret) {
+		/* walked past the last key for this inode number */
+		if (k.k->p.offset != inode_nr)
+			break;
+		if (!bkey_is_inode(k.k))
+			continue;
+		ret = bch2_inode_unpack(k, inode);
+		goto found;
+	}
+
+	/*
+	 * Only report "not found" when the scan itself did not fail: an error
+	 * stored in ret by the iteration must not be rewritten into ENOENT.
+	 * NOTE(review): assumes for_each_btree_key_norestart() leaves ret == 0
+	 * on a clean end of iteration or on break - confirm against the macro.
+	 */
+	if (!ret)
+		ret = -BCH_ERR_ENOENT_inode;
+found:
+	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Delete the dirent at @pos through the dirent hash table code.
+ *
+ * The hash info is built from the first inode found for pos.inode (see
+ * lookup_first_inode()); the delete is issued with
+ * BTREE_UPDATE_internal_snapshot_node. Any error is logged via bch_err_fn()
+ * and returned to the caller.
+ */
+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked dir;
+	int ret = lookup_first_inode(trans, pos.inode, &dir);
+
+	if (!ret) {
+		struct bch_hash_info hash = bch2_hash_info_init(c, &dir);
+		struct btree_iter dirent_iter;
+
+		bch2_trans_iter_init(trans, &dirent_iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
+
+		ret = bch2_btree_iter_traverse(&dirent_iter);
+		if (!ret)
+			ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+						  &hash, &dirent_iter,
+						  BTREE_UPDATE_internal_snapshot_node);
+
+		/* the iterator is released whether or not the delete succeeded */
+		bch2_trans_iter_exit(trans, &dirent_iter);
+	}
+
+	bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index a6e15a01..0880772b 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -82,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
+
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
index b32e91ba..8a8de614 100644
--- a/libbcachefs/disk_accounting.c
+++ b/libbcachefs/disk_accounting.c
@@ -135,6 +135,12 @@ static inline bool is_zero(char *start, char *end)
#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
+/*
+ * Number of counters each accounting key type is expected to carry, indexed by
+ * BCH_DISK_ACCOUNTING_<name>. Generated from the third field of every entry in
+ * BCH_DISK_ACCOUNTING_TYPES().
+ */
+static const unsigned bch2_accounting_type_nr_counters[] = {
+#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+};
+
int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -193,6 +199,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
c, accounting_key_junk_at_end,
"junk at end of accounting key");
+
+ bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
+ c, accounting_key_nr_counters_wrong,
+ "accounting key with %u counters, should be %u",
+ bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err:
return ret;
}
diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h
index f4372caf..abb1f620 100644
--- a/libbcachefs/disk_accounting.h
+++ b/libbcachefs/disk_accounting.h
@@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
struct bkey_s_c_accounting src)
{
- EBUG_ON(dst->k.u64s != src.k->u64s);
-
- for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+ for (unsigned i = 0;
+ i < min(bch2_accounting_counters(&dst->k),
+ bch2_accounting_counters(src.k));
+ i++)
dst->v.d[i] += src.v->d[i];
+
if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
dst->k.bversion = src.k->bversion;
}
@@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
s64 *, unsigned, bool);
+
+/*
+ * Initialize the accounting position @_k for accounting type @_type:
+ * zero the whole object, set .type to BCH_DISK_ACCOUNTING_<_type>, then assign
+ * the union member named <_type> from a struct bch_acct_<_type> compound
+ * literal built from the remaining arguments (typically designated
+ * initializers; may be empty).
+ *
+ * @_k is evaluated more than once, so it must be free of side effects.
+ */
+#define disk_accounting_key_init(_k, _type, ...) \
+do { \
+ memset(&(_k), 0, sizeof(_k)); \
+ (_k).type = BCH_DISK_ACCOUNTING_##_type; \
+ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
+} while (0)
+
+/*
+ * Build a disk_accounting_pos from the trailing arguments (accounting type
+ * name followed by its field initializers, as taken by
+ * disk_accounting_key_init()) and pass it to bch2_disk_accounting_mod() with
+ * counters @_v, counter count @_nr and gc flag @_gc.
+ *
+ * Evaluates to the return value of bch2_disk_accounting_mod().
+ *
+ * The transaction is taken from the @_trans argument rather than from whatever
+ * "trans" happens to be in scope at the call site, and the temporary is named
+ * _pos so it cannot shadow a caller variable used in the arguments.
+ */
+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...)		\
+({									\
+	struct disk_accounting_pos _pos;				\
+	disk_accounting_key_init(_pos, __VA_ARGS__);			\
+	bch2_disk_accounting_mod(_trans, &_pos, _v, _nr, _gc);		\
+})
+
+/*
+ * Same as bch2_disk_accounting_mod2_nr(), with the counter count taken from
+ * ARRAY_SIZE(_v): @_v must therefore be a real array, not a pointer.
+ */
+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
+ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
+
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
diff --git a/libbcachefs/disk_accounting_format.h b/libbcachefs/disk_accounting_format.h
index 7b6e6c97..8269af1d 100644
--- a/libbcachefs/disk_accounting_format.h
+++ b/libbcachefs/disk_accounting_format.h
@@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
}
}
+/*
+ * field 1: name
+ * field 2: id
+ * field 3: number of counters (max 3)
+ */
+
#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0) \
- x(persistent_reserved, 1) \
- x(replicas, 2) \
- x(dev_data_type, 3) \
- x(compression, 4) \
- x(snapshot, 5) \
- x(btree, 6) \
- x(rebalance_work, 7) \
- x(inum, 8)
+ x(nr_inodes, 0, 1) \
+ x(persistent_reserved, 1, 1) \
+ x(replicas, 2, 1) \
+ x(dev_data_type, 3, 3) \
+ x(compression, 4, 3) \
+ x(snapshot, 5, 1) \
+ x(btree, 6, 1) \
+ x(rebalance_work, 7, 1) \
+ x(inum, 8, 3)
enum disk_accounting_type {
-#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
+#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
BCH_DISK_ACCOUNTING_TYPES()
#undef x
BCH_DISK_ACCOUNTING_TYPE_NR,
};
-struct bch_nr_inodes {
+/*
+ * No subtypes - number of inodes in the entire filesystem
+ *
+ * XXX: perhaps we could add a per-subvolume counter?
+ */
+struct bch_acct_nr_inodes {
};
-struct bch_persistent_reserved {
+/*
+ * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
+ * reservation:
+ */
+struct bch_acct_persistent_reserved {
__u8 nr_replicas;
};
-struct bch_dev_data_type {
+/*
+ * device, data type counter fields:
+ * [
+ * nr_buckets
+ * live sectors (in buckets of that data type)
+ * sectors of internal fragmentation
+ * ]
+ *
+ * XXX: live sectors should've been done differently, you can have multiple data
+ * types in the same bucket (user, stripe, cached) and this collapses them to
+ * the bucket data type, and makes the internal fragmentation counter redundant
+ */
+struct bch_acct_dev_data_type {
__u8 dev;
__u8 data_type;
};
+/*
+ * Compression type fields:
+ * [
+ * number of extents
+ * uncompressed size
+ * compressed size
+ * ]
+ *
+ * Compression ratio, average extent size (fragmentation).
+ */
struct bch_acct_compression {
__u8 type;
};
+/*
+ * On disk usage by snapshot id; counts same values as replicas counter, but
+ * aggregated differently
+ */
struct bch_acct_snapshot {
__u32 id;
} __packed;
@@ -137,10 +178,27 @@ struct bch_acct_btree {
__u32 id;
} __packed;
+/*
+ * inum counter fields:
+ * [
+ * number of extents
+ * sum of extent sizes - bkey size
+ * this field is similar to inode.bi_sectors, except here extents in
+ * different snapshots but the same inode number are all collapsed to the
+ * same counter
+ * sum of on disk size - same values tracked by replicas counters
+ * ]
+ *
+ * This tracks on disk fragmentation.
+ */
struct bch_acct_inum {
__u64 inum;
} __packed;
+/*
+ * Simple counter of the amount of data (on disk sectors) rebalance needs to
+ * move, extents counted here are also in the rebalance_work btree.
+ */
struct bch_acct_rebalance_work {
};
@@ -149,10 +207,10 @@ struct disk_accounting_pos {
struct {
__u8 type;
union {
- struct bch_nr_inodes nr_inodes;
- struct bch_persistent_reserved persistent_reserved;
+ struct bch_acct_nr_inodes nr_inodes;
+ struct bch_acct_persistent_reserved persistent_reserved;
struct bch_replicas_entry_v1 replicas;
- struct bch_dev_data_type dev_data_type;
+ struct bch_acct_dev_data_type dev_data_type;
struct bch_acct_compression compression;
struct bch_acct_snapshot snapshot;
struct bch_acct_btree btree;
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index c73ba73f..f2b9225f 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -1124,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return -BCH_ERR_erasure_coding_found_btree_node;
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1190,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -EIO;
+ return -BCH_ERR_ENOENT_dev_not_found;
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1227,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
- ret = bch2_btree_write_buffer_flush_sync(trans);
+ int ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
- for (i = 0; i < nr_data; i++) {
+ for (unsigned i = 0; i < nr_data; i++) {
ret = ec_stripe_update_bucket(trans, s, i);
if (ret)
break;
}
err:
bch2_trans_put(trans);
-
return ret;
}
@@ -1451,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
ec_stripe_new_set_pending(c, h);
}
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
{
struct ec_stripe_new *s = ob->ec;
- s->err = -EIO;
+ s->err = err;
}
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 8f2228e5..62d27e04 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -249,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index c179954a..101806d7 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -116,6 +116,7 @@
x(ENOENT, ENOENT_snapshot_tree) \
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_bucket_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
@@ -207,6 +208,7 @@
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
x(EINVAL, varint_decode_error) \
+ x(EINVAL, erasure_coding_found_btree_node) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
@@ -267,6 +269,7 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, journal_shutdown) \
x(EIO, journal_flush_err) \
+ x(EIO, journal_write_err) \
x(EIO, btree_node_read_err) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
x(EIO, sb_not_downgraded) \
@@ -275,6 +278,7 @@
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
+ x(EIO, trigger_alloc) \
x(EIO, trigger_pointer) \
x(EIO, trigger_stripe_pointer) \
x(EIO, metadata_bucket_inconsistency) \
@@ -290,7 +294,19 @@
x(EIO, EIO_fault_injected) \
x(EIO, ec_block_read) \
x(EIO, ec_block_write) \
- x(EIO, data_read) \
+ x(EIO, recompute_checksum) \
+ x(EIO, decompress) \
+ x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
+ x(BCH_ERR_decompress, decompress_lz4) \
+ x(BCH_ERR_decompress, decompress_gzip) \
+ x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
+ x(BCH_ERR_decompress, decompress_zstd) \
+ x(EIO, data_write) \
+ x(BCH_ERR_data_write, data_write_io) \
+ x(BCH_ERR_data_write, data_write_csum) \
+ x(BCH_ERR_data_write, data_write_invalid_ptr) \
+ x(BCH_ERR_data_write, data_write_misaligned) \
+ x(EIO, data_read) \
x(BCH_ERR_data_read, no_device_to_read_from) \
x(BCH_ERR_data_read, data_read_io_err) \
x(BCH_ERR_data_read, data_read_csum_err) \
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 6d68c89a..207f35d3 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -3,8 +3,8 @@
#include "btree_cache.h"
#include "btree_iter.h"
#include "error.h"
-#include "fs-common.h"
#include "journal.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 04946d99..ca2073db 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -136,12 +136,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return -BCH_ERR_extent_poisened;
-
rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
@@ -592,29 +588,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
struct bch_extent_crc_unpacked src,
enum bch_extent_entry_type type)
{
-#define set_common_fields(_dst, _src) \
- _dst.type = 1 << type; \
- _dst.csum_type = _src.csum_type, \
- _dst.compression_type = _src.compression_type, \
- _dst._compressed_size = _src.compressed_size - 1, \
- _dst._uncompressed_size = _src.uncompressed_size - 1, \
- _dst.offset = _src.offset
+#define common_fields(_src) \
+ .type = BIT(type), \
+ .csum_type = _src.csum_type, \
+ .compression_type = _src.compression_type, \
+ ._compressed_size = _src.compressed_size - 1, \
+ ._uncompressed_size = _src.uncompressed_size - 1, \
+ .offset = _src.offset
switch (type) {
case BCH_EXTENT_ENTRY_crc32:
- set_common_fields(dst->crc32, src);
- dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ dst->crc32 = (struct bch_extent_crc32) {
+ common_fields(src),
+ .csum = (u32 __force) *((__le32 *) &src.csum.lo),
+ };
break;
case BCH_EXTENT_ENTRY_crc64:
- set_common_fields(dst->crc64, src);
- dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = (u64 __force) src.csum.lo;
- dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ dst->crc64 = (struct bch_extent_crc64) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum_lo = (u64 __force) src.csum.lo,
+ .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
+ };
break;
case BCH_EXTENT_ENTRY_crc128:
- set_common_fields(dst->crc128, src);
- dst->crc128.nonce = src.nonce;
- dst->crc128.csum = src.csum;
+ dst->crc128 = (struct bch_extent_crc128) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum = src.csum,
+ };
break;
default:
BUG();
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 5ab1c73c..a03e2c78 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -225,11 +225,11 @@ static void bchfs_read(struct btree_trans *trans,
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
if (flags & BCH_READ_last_fragment)
break;
- swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
err:
if (ret &&
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 5b47b94f..e3a3230f 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -5,8 +5,8 @@
#include "chardev.h"
#include "dirent.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-ioctl.h"
+#include "namei.h"
#include "quota.h"
#include <linux/compat.h>
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 4453dd2f..94e97e28 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -11,7 +11,6 @@
#include "errcode.h"
#include "extents.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
@@ -22,6 +21,7 @@
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "snapshot.h"
@@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (inode)
goto out;
+ /*
+ * Note: if check/repair needs it, we commit before
+ * bch2_inode_hash_init_insert(), as after that point we can't take a
+ * restart - not in the top level loop with a commit_do(), like we
+ * usually do:
+ */
+
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+ /*
+ * don't remove it: check_inodes might find another inode that points
+ * back to this dirent
+ */
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
c, "dirent to missing inode:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
if (ret)
goto err;
-
- /* regular files may have hardlinks: */
- if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
- !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
- c,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, &inode_u),
- buf.buf))) {
- ret = -ENOENT;
- goto err;
- }
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -2177,7 +2179,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
/* Some options can't be parsed until after the fs is started: */
opts = bch2_opts_empty();
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
if (ret)
goto err_stop_fs;
@@ -2331,6 +2333,8 @@ static int bch2_fs_parse_param(struct fs_context *fc,
int ret = bch2_parse_one_mount_opt(c, &opts->opts,
&opts->parse_later, param->key,
param->string);
+ if (ret)
+ pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
return bch2_err_class(ret);
}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 0e85131d..f955b8f9 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -10,10 +10,10 @@
#include "dirent.h"
#include "error.h"
#include "fs.h"
-#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
@@ -23,13 +23,6 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
@@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = -BCH_ERR_ENOENT_inode;
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
struct bch_inode_unpacked *inode)
{
@@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
/*
* Find any subvolume associated with a tree of snapshots
* We can't rely on master_subvol - it might have been deleted.
@@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans,
SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
int ret = bkey_err(d) ?:
dirent_points_to_inode(c, d, inode) ?:
- __remove_dirent(trans, d.k->p);
+ bch2_fsck_remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa
trans_was_restarted(trans, restart_count);
}
-noinline_for_stack
-static int check_dirent_inode_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (fsck_err_on(!backpointer_exists,
- trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- goto out;
- }
-
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (fsck_err_on(backpointer_exists &&
- (S_ISDIR(target->bi_mode) ||
- target->bi_subvol),
- trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
-
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = check_dirent_inode_dirent(trans, iter, d, target);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
- if (ret)
- goto err;
-
- d = dirent_i_to_s_c(n);
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
/* find a subvolume that's a descendent of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
@@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (fsck_err(trans, dirent_to_missing_subvol,
"dirent points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return __remove_dirent(trans, d.k->p);
+ return bch2_fsck_remove_dirent(trans, d.k->p);
ret = 0;
goto out;
}
@@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
goto err;
}
- ret = check_dirent_target(trans, iter, d, &subvol_root);
+ ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
if (ret)
goto err;
out:
@@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- ret = __remove_dirent(trans, d.k->p);
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
if (ret)
goto err;
}
darray_for_each(target->inodes, i) {
- ret = check_dirent_target(trans, iter, d, &i->inode);
+ ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
if (ret)
goto err;
}
@@ -3240,7 +3021,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
if (!IS_ERR(optstr))
kfree(optstr);
@@ -3348,7 +3129,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
if (!IS_ERR(optstr))
kfree(optstr);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 7aca010e..80051073 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans,
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
- int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
if (ret)
return ret;
}
@@ -1079,7 +1078,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
- ret = -EIO;
+ ret = -BCH_ERR_ENOENT_inode;
goto err;
}
@@ -1243,7 +1242,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
- ret = -EIO;
+ ret = -BCH_ERR_ENOENT_inode;
goto err;
}
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 428b9be6..f82cfbf4 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i
bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
return S_ISDIR(inode->bi_mode) ||
+ inode->bi_subvol ||
(!inode->bi_nlink && inode_has_bp);
}
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 4fb279f1..a04dffa4 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -295,6 +295,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
bool *read_full,
struct bch_io_failures *failed)
{
+ /*
+ * We're in the retry path, but we don't know what to repair yet, and we
+ * don't want to do a promote here:
+ */
+ if (failed && !failed->nr)
+ return NULL;
+
struct bch_fs *c = trans->c;
/*
* if failed != NULL we're not actually doing a promote, we're
@@ -429,6 +436,71 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
+/*
+ * Look up the key at rbio->data_pos in rbio->data_btree and, if one of its
+ * pointers equals rbio->pick.ptr, copy the key into @sk.
+ *
+ * @sk is left untouched when the lookup fails or when no pointer matches, so
+ * the caller can tell from @sk whether a matching extent was found.
+ */
+static void get_rbio_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bkey_buf *sk)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
+ rbio->data_btree, rbio->data_pos, 0)));
+ /* NOTE(review): returns without bch2_trans_iter_exit() - presumably bch2_bkey_get_iter() cleans up the iterator on error; confirm */
+ if (ret)
+ return;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
+ /* first matching pointer is enough */
+ bch2_bkey_buf_reassemble(sk, trans->c, k);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+}
+
+/*
+ * Set BCH_EXTENT_FLAG_poisoned on the extent that was read, unless it is
+ * already set.
+ *
+ * If @rbio belongs to a data update, the update's copy of the key is used in
+ * place of @read_k. The key currently in @btree at the start position of the
+ * extent is fetched with an intent iterator; if it no longer equals the key
+ * that was read, nothing is changed. Otherwise a copy with room for a
+ * struct bch_extent_flags entry is allocated, the poisoned bit is added to the
+ * existing flags, and the update is committed.
+ *
+ * Returns 0 when the flag was already set, when the key has changed, or on a
+ * successful commit; otherwise the error from the failing step.
+ */
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ enum btree_id btree, struct bkey_s_c read_k)
+{
+ struct bch_fs *c = trans->c;
+
+ struct data_update *u = rbio_data_update(rbio);
+ if (u)
+ read_k = bkey_i_to_s_c(u->k.k);
+
+ u64 flags = bch2_bkey_extent_flags(read_k);
+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ /* key changed since it was read: leave it alone (ret is 0 here) */
+ if (!bkey_and_val_eq(k, read_k))
+ goto out;
+
+ struct bkey_i *new = bch2_trans_kmalloc(trans,
+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
+ /* each step runs only if every earlier one evaluated to 0 */
+ ret = PTR_ERR_OR_ZERO(new) ?:
+ (bkey_reassemble(new, k), 0) ?:
+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
+ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+
+ /*
+ * Propagate key change back to data update path, in particular so it
+ * knows the extent has been poisoned and it's safe to change the
+ * checksum
+ */
+ if (u && !ret)
+ bch2_bkey_buf_copy(&u->k, c, new);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
@@ -462,7 +534,8 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
if (ret) {
@@ -486,13 +559,21 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
+
struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_buf sk;
+ bch2_bkey_buf_init(&sk);
+ bkey_init(&sk.k->k);
+
trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));
- if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+ get_rbio_extent(trans, rbio, &sk);
+
+ if (!bkey_deleted(&sk.k->k) &&
+ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
@@ -513,7 +594,7 @@ static void bch2_rbio_retry(struct work_struct *work)
int ret = rbio->data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
- : __bch2_read(trans, rbio, iter, inum, &failed, flags);
+ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
if (ret) {
rbio->ret = ret;
@@ -534,6 +615,7 @@ static void bch2_rbio_retry(struct work_struct *work)
}
bch2_rbio_done(rbio);
+ bch2_bkey_buf_exit(&sk, c);
bch2_trans_put(trans);
}
@@ -958,6 +1040,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
bvec_iter_sectors(iter));
goto out_read_done;
}
+
+ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
+ !orig->data_update)
+ return -BCH_ERR_extent_poisened;
retry_pick:
ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
@@ -966,6 +1052,16 @@ retry_pick:
goto hole;
if (unlikely(ret < 0)) {
+ if (ret == -BCH_ERR_data_read_csum_err) {
+ int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
+ if (ret2) {
+ ret = ret2;
+ goto err;
+ }
+
+ trace_and_count(c, io_read_fail_and_poison, &orig->bio);
+ }
+
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
@@ -1263,12 +1359,15 @@ out_read_done:
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
+ struct bch_io_failures *failed,
+ struct bkey_buf *prev_read,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ enum btree_id data_btree;
int ret;
EBUG_ON(rbio->data_update);
@@ -1279,7 +1378,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
BTREE_ITER_slots);
while (1) {
- enum btree_id data_btree = BTREE_ID_extents;
+ data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1311,6 +1410,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
k = bkey_i_to_s_c(sk.k);
+ if (unlikely(flags & BCH_READ_in_retry)) {
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
+ failed->nr = 0;
+ bch2_bkey_buf_copy(prev_read, c, sk.k);
+ }
+
/*
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
@@ -1326,13 +1431,14 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
data_btree, k,
offset_into_extent, failed, flags, -1);
+ swap(bvec_iter.bi_size, bytes);
+
if (ret)
goto err;
if (flags & BCH_READ_last_fragment)
break;
- swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
@@ -1344,9 +1450,7 @@ err:
break;
}
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
+ if (unlikely(ret)) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
@@ -1362,6 +1466,7 @@ err:
bch2_rbio_done(rbio);
}
+ bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
index cd219504..1a85b092 100644
--- a/libbcachefs/io_read.h
+++ b/libbcachefs/io_read.h
@@ -137,12 +137,15 @@ static inline void bch2_read_extent(struct btree_trans *trans,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent, unsigned flags)
{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags, -1);
+ int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags, -1);
+ /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
+ WARN(ret, "unhandled error from __bch2_read_extent()");
}
int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+ subvol_inum,
+ struct bch_io_failures *, struct bkey_buf *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
@@ -152,7 +155,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
rbio->subvol = inum.subvol;
bch2_trans_run(c,
- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped));
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index a2e6b305..07b55839 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -402,61 +402,36 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
-void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_write_op *op, u64 offset, const char *fmt, ...)
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
- if (op->subvol)
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9));
- else {
- struct bpos pos = op->pos;
- pos.offset = offset;
- lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
- }
-
- prt_str(out, "write error: ");
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(out, fmt, args);
- va_end(args);
-
- if (op->flags & BCH_WRITE_move) {
- struct data_update *u = container_of(op, struct data_update, op);
-
- prt_printf(out, "\n from internal move ");
- bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
- }
-}
+ struct printbuf buf = PRINTBUF;
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset,
- const char *fmt, ...)
-{
- if (op->subvol)
- bch2_inum_offset_err_msg(op->c, out,
+ if (op->subvol) {
+ bch2_inum_offset_err_msg(op->c, &buf,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
- else {
+ } else {
struct bpos pos = op->pos;
pos.offset = offset;
- bch2_inum_snap_offset_err_msg(op->c, out, pos);
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
}
- prt_str(out, "write error: ");
+ prt_str(&buf, "write error: ");
va_list args;
va_start(args, fmt);
- prt_vprintf(out, fmt, args);
+ prt_vprintf(&buf, fmt, args);
va_end(args);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
- prt_printf(out, "\n from internal move ");
- bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
+ prt_printf(&buf, "\n from internal move ");
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
}
+
+ bch_err_ratelimited(op->c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -554,7 +529,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
+ return -BCH_ERR_data_write_io;
}
if (dst != src)
@@ -598,11 +573,8 @@ static void __bch2_write_index(struct bch_write_op *op)
if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k),
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
"btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
}
if (ret)
@@ -611,7 +583,7 @@ static void __bch2_write_index(struct bch_write_op *op)
out:
/* If some a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
bch2_open_buckets_put(c, &op->open_buckets);
return;
@@ -837,7 +809,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
{
struct bio *bio = &op->wbio.bio;
struct bch_extent_crc_unpacked new_crc;
- int ret;
/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
@@ -845,10 +816,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
bch2_csum_type_is_encryption(new_csum_type))
new_csum_type = op->crc.csum_type;
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
if (ret)
return ret;
@@ -858,44 +829,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
return 0;
}
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_data_encoded))
- return PREP_ENCODED_OK;
+ struct bch_csum csum;
+ int ret = 0;
BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
@@ -906,12 +845,13 @@ static enum prep_encoded_ret {
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ op->csum_type != op->crc.csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
- return PREP_ENCODED_DO_WRITE;
+ return 1;
}
/*
@@ -919,20 +859,24 @@ static enum prep_encoded_ret {
* is, we have to decompress it:
*/
if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
/* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ goto csum_err;
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
- if (bch2_bio_uncompress_inplace(op, bio))
- return PREP_ENCODED_ERR;
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
+
+ ret = bch2_bio_uncompress_inplace(op, bio);
+ if (ret)
+ return ret;
}
/*
@@ -944,22 +888,44 @@ static enum prep_encoded_ret {
* If the data is checksummed and we're only writing a subset,
* rechecksum and adjust bio to point to currently live data:
*/
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
- return PREP_ENCODED_OK;
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
+
+ return 0;
+csum_err:
+ bch2_write_op_error(op, op->pos.offset,
+ "error verifying existing checksum while moving existing data (memory corruption?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx type %s",
+ op->crc.csum.hi,
+ op->crc.csum.lo,
+ csum.hi,
+ csum.lo,
+ op->crc.csum_type < BCH_CSUM_NR
+ ? __bch2_csum_types[op->crc.csum_type]
+ : "(unknown)");
+ return -BCH_ERR_data_write_csum;
}
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -974,29 +940,28 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bool page_alloc_failed = false;
int ret, more = 0;
+ if (op->incompressible)
+ op->compression_opt = 0;
+
BUG_ON(!bio_sectors(src));
ec_buf = bch2_writepoint_ec_buf(c, wp);
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+ ret = bch2_write_prep_encoded_data(op, wp);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
}
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
}
if (ec_buf ||
@@ -1089,12 +1054,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
* data can't be modified (by userspace) while it's in
* flight.
*/
- if (bch2_rechecksum_bio(c, src, version, op->crc,
+ ret = bch2_rechecksum_bio(c, src, version, op->crc,
&crc, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
+ op->csum_type);
+ if (ret)
+ goto err;
/*
* rchecksum_bio sets compression_type on crc from op->crc,
* this isn't always correct as sometimes we're changing
@@ -1104,12 +1070,12 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.nonce = nonce;
} else {
if ((op->flags & BCH_WRITE_data_encoded) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
+ (ret = bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
+ op->crc.csum_type)))
+ goto err;
crc.compressed_size = dst_len >> 9;
crc.uncompressed_size = src_len >> 9;
@@ -1168,16 +1134,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
do_write:
*_dst = dst;
return more;
-csum_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
- "error verifying existing checksum while rewriting existing data (memory corruption?)");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = -EIO;
err:
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
@@ -1255,38 +1211,35 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (ret) {
- op->error = ret;
+ if (ret)
break;
- }
}
bch2_trans_put(trans);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
+ }
+
+ if (ret)
+ op->error = ret;
}
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = -EIO;
+ op->error = -BCH_ERR_data_write_io;
} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1436,11 +1389,8 @@ err:
darray_exit(&buckets);
if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
+ bch2_write_op_error(op, op->pos.offset,
"%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
op->error = ret;
op->flags |= BCH_WRITE_submitted;
}
@@ -1480,7 +1430,7 @@ err_bucket_stale:
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_data_write_invalid_ptr;
} else {
/* We can retry this: */
ret = -BCH_ERR_transaction_restart;
@@ -1558,13 +1508,9 @@ err:
op->flags |= BCH_WRITE_submitted;
if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_alloc_nowait)) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
+ bch2_write_op_error(op, op->pos.offset,
"%s(): %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
op->error = ret;
break;
}
@@ -1691,11 +1637,8 @@ CLOSURE_CALLBACK(bch2_write)
wbio_init(bio)->put_bio = false;
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op, op->pos.offset,
- "misaligned write");
- printbuf_exit(&buf);
- op->error = -EIO;
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
+ op->error = -BCH_ERR_data_write_misaligned;
goto err;
}
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
index 62773053..b8ab19a1 100644
--- a/libbcachefs/io_write.h
+++ b/libbcachefs/io_write.h
@@ -14,13 +14,8 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
-__printf(5, 6)
-void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_write_op *op, u64, const char *, ...);
-
-__printf(4, 5)
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64,
- const char *, ...);
+__printf(3, 4)
+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
#define BCH_WRITE_FLAGS() \
x(alloc_nowait) \
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index ce730269..ecb97d43 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -62,8 +62,7 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_newline(out);
}
- prt_printf(out, "expires:\t");
- prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
+ prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
prt_printf(out, "flags:\t");
if (buf->noflush)
@@ -142,6 +141,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
bool stuck = false;
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
if (!(error == -BCH_ERR_journal_full ||
error == -BCH_ERR_journal_pin_full) ||
nr_unwritten_journal_entries(j) ||
@@ -172,7 +173,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
bch2_err_str(error));
bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "%s", buf.buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
@@ -726,10 +727,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
remaining_wait))
return ret;
+ bch_err(c, "Journal stuck? Waited for 10 seconds, err %s", bch2_err_str(ret));
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
- buf.buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
closure_wait_event(&j->async_wait,
@@ -1510,7 +1511,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
return -BCH_ERR_ENOMEM_dev_journal_init;
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index cf2700b0..4ed6137f 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1624,7 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- err = -EIO;
+ err = -BCH_ERR_journal_write_err;
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 3ed31492..5d1547aa 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j)
* @j: journal object
* @direct: direct or background reclaim?
* @kicked: requested to run since we last ran?
- * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (kthread && kthread_should_stop())
break;
- if (bch2_journal_error(j)) {
- ret = -EIO;
+ ret = bch2_journal_error(j);
+ if (ret)
break;
- }
bch2_journal_do_discards(j);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 8fcdc698..66d1c055 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -126,26 +126,40 @@ static void move_write_done(struct bch_write_op *op)
static void move_write(struct moving_io *io)
{
+ struct bch_fs *c = io->write.op.c;
struct moving_context *ctxt = io->write.ctxt;
+ struct bch_read_bio *rbio = &io->write.rbio;
if (ctxt->stats) {
- if (io->write.rbio.bio.bi_status)
+ if (rbio->bio.bi_status)
atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
&ctxt->stats->sectors_error_uncorrected);
- else if (io->write.rbio.saw_error)
+ else if (rbio->saw_error)
atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
&ctxt->stats->sectors_error_corrected);
}
- if (unlikely(io->write.rbio.ret ||
- io->write.rbio.bio.bi_status ||
- io->write.data_opts.scrub)) {
+ /*
+ * If the extent has been bitrotted, we're going to have to give it a
+ * new checksum in order to move it - but the poison bit will ensure
+ * that userspace still gets the appropriate error.
+ */
+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+
+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
+ nonce, &rbio->bio);
+ rbio->ret = 0;
+ }
+
+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
move_free(io);
return;
}
if (trace_io_move_write_enabled()) {
- struct bch_fs *c = io->write.op.c;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
@@ -528,6 +542,37 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
return 0;
}
+/*
+ * Indirect extent lookup for the move path: move requires a non-extents
+ * iterator, and has no need to signal indirect_extent_missing_error.
+ *
+ * Returns bkey_s_c_null if @p has its error flag set, or if the key found by
+ * the peek starts after the position @p refers to; otherwise returns the
+ * result of the peek, which may itself be null or an error.
+ *
+ * @iter is left initialized only when a usable key is returned; on every
+ * other path it has either been exited already or was never initialized.
+ */
+static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
+					struct btree_iter *iter,
+					struct bkey_s_c_reflink_p p)
+{
+	if (unlikely(REFLINK_P_ERROR(p.v)))
+		return bkey_s_c_null;
+
+	struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
+
+	bch2_trans_iter_init(trans, iter,
+			     BTREE_ID_reflink, reflink_pos,
+			     BTREE_ITER_not_extents);
+
+	struct bkey_s_c k = bch2_btree_iter_peek(iter);
+
+	if (k.k && !bkey_err(k)) {
+		/* Found a key: usable only if it does not start past reflink_pos */
+		if (!bkey_lt(reflink_pos, bkey_start_pos(k.k)))
+			return k;
+
+		k = bkey_s_c_null;
+	}
+
+	bch2_trans_iter_exit(trans, iter);
+	return k;
+}
+
static int bch2_move_data_btree(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
@@ -592,17 +637,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = 0;
bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
+ k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- if (bkey_deleted(k.k))
+ if (!k.k)
goto next_nondata;
/*
@@ -611,7 +655,6 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
* pointer - need to fixup iter->k
*/
extent_iter = &reflink_iter;
- offset_into_extent = 0;
}
if (!bkey_extent_is_direct_data(k.k))
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
index 82e473ed..807f779f 100644
--- a/libbcachefs/move_types.h
+++ b/libbcachefs/move_types.h
@@ -32,7 +32,7 @@ struct bch_move_stats {
struct move_bucket_key {
struct bpos bucket;
- u8 gen;
+ unsigned gen;
};
struct move_bucket {
diff --git a/libbcachefs/fs-common.c b/libbcachefs/namei.c
index fbc3da59..93246ad3 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/namei.c
@@ -4,8 +4,8 @@
#include "acl.h"
#include "btree_update.h"
#include "dirent.h"
-#include "fs-common.h"
#include "inode.h"
+#include "namei.h"
#include "subvolume.h"
#include "xattr.h"
@@ -564,6 +564,8 @@ err:
return ret;
}
+/* inum_to_path */
+
static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
{
bch2_printbuf_make_room(out, n);
@@ -654,3 +656,179 @@ disconnected:
prt_str_reversed(path, "(disconnected)");
goto out;
}
+
+/* fsck */
+
+/*
+ * Check that the inode a dirent points to points back at that dirent, and
+ * repair the inode's backpointer (bi_dir/bi_dir_offset) or link count where
+ * it does not.
+ *
+ * @d:		dirent being checked
+ * @target:	unpacked inode that @d points to; may be modified and rewritten
+ * @in_fsck:	when false, a directory or subvolume with multiple links is
+ *		reported with bch2_fs_inconsistent() instead of having this
+ *		dirent removed
+ *
+ * Returns 0 or an error code.
+ */
+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
+					  struct bkey_s_c_dirent d,
+					  struct bch_inode_unpacked *target,
+					  bool in_fsck)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	/*
+	 * Initialized to { NULL } because the common exit path below can be
+	 * reached before the iterator has been set up.
+	 */
+	struct btree_iter bp_iter = { NULL };
+	int ret = 0;
+
+	if (inode_points_to_dirent(target, d))
+		return 0;
+
+	if (!target->bi_dir &&
+	    !target->bi_dir_offset) {
+		fsck_err_on(S_ISDIR(target->bi_mode),
+			    trans, inode_dir_missing_backpointer,
+			    "directory with missing backpointer\n%s",
+			    (printbuf_reset(&buf),
+			     bch2_bkey_val_to_text(&buf, c, d.s_c),
+			     prt_printf(&buf, "\n"),
+			     bch2_inode_unpacked_to_text(&buf, target),
+			     buf.buf));
+
+		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+			    trans, inode_unlinked_but_has_dirent,
+			    "inode unlinked but has dirent\n%s",
+			    (printbuf_reset(&buf),
+			     bch2_bkey_val_to_text(&buf, c, d.s_c),
+			     prt_printf(&buf, "\n"),
+			     bch2_inode_unpacked_to_text(&buf, target),
+			     buf.buf));
+
+		target->bi_flags	&= ~BCH_INODE_unlinked;
+		target->bi_dir		= d.k->p.inode;
+		target->bi_dir_offset	= d.k->p.offset;
+
+		/*
+		 * Leave through the common exit path instead of returning
+		 * directly: the fsck_err_on() calls above may have formatted
+		 * text into buf, which has to be released by printbuf_exit().
+		 */
+		ret = __bch2_fsck_write_inode(trans, target);
+		goto out;
+	}
+
+	if (bch2_inode_should_have_single_bp(target) &&
+	    !fsck_err(trans, inode_wrong_backpointer,
+		      "dirent points to inode that does not point back:\n %s",
+		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
+		       prt_printf(&buf, "\n "),
+		       bch2_inode_unpacked_to_text(&buf, target),
+		       buf.buf)))
+		goto err;
+
+	/* Look up the dirent the inode's backpointer currently refers to: */
+	struct bkey_s_c_dirent bp_dirent =
+		bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
+			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
+			      0, dirent);
+	ret = bkey_err(bp_dirent);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;
+
+	/* ENOENT is not an error here, it just means no such dirent exists */
+	bool backpointer_exists = !ret;
+	ret = 0;
+
+	if (!backpointer_exists) {
+		if (fsck_err(trans, inode_wrong_backpointer,
+			     "inode %llu:%u has wrong backpointer:\n"
+			     "got %llu:%llu\n"
+			     "should be %llu:%llu",
+			     target->bi_inum, target->bi_snapshot,
+			     target->bi_dir,
+			     target->bi_dir_offset,
+			     d.k->p.inode,
+			     d.k->p.offset)) {
+			target->bi_dir		= d.k->p.inode;
+			target->bi_dir_offset	= d.k->p.offset;
+			ret = __bch2_fsck_write_inode(trans, target);
+		}
+	} else {
+		bch2_bkey_val_to_text(&buf, c, d.s_c);
+		prt_newline(&buf);
+		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+		if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
+			/*
+			 * XXX: verify connectivity of the other dirent
+			 * up to the root before removing this one
+			 *
+			 * Additionally, bch2_lookup would need to cope with the
+			 * dirent it found being removed - or should we remove
+			 * the other one, even though the inode points to it?
+			 */
+			if (in_fsck) {
+				if (fsck_err(trans, inode_dir_multiple_links,
+					     "%s %llu:%u with multiple links\n%s",
+					     S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+					     target->bi_inum, target->bi_snapshot, buf.buf))
+					ret = bch2_fsck_remove_dirent(trans, d.k->p);
+			} else {
+				bch2_fs_inconsistent(c,
+						"%s %llu:%u with multiple links\n%s",
+						S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+						target->bi_inum, target->bi_snapshot, buf.buf);
+			}
+
+			goto out;
+		} else {
+			/*
+			 * hardlinked file with nlink 0:
+			 * We're just adjusting nlink here so check_nlinks() will pick
+			 * it up, it ignores inodes with nlink 0
+			 */
+			if (fsck_err_on(!target->bi_nlink,
+					trans, inode_multiple_links_but_nlink_0,
+					"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+					target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+				target->bi_nlink++;
+				target->bi_flags &= ~BCH_INODE_unlinked;
+				ret = __bch2_fsck_write_inode(trans, target);
+				if (ret)
+					goto err;
+			}
+		}
+	}
+out:
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Check a dirent against the inode it points to: first the inode's
+ * backpointer, via bch2_check_dirent_inode_dirent(), then the dirent's d_type.
+ *
+ * A dirent with the wrong d_type is rewritten with the type, and the inum or
+ * subvolume fields, taken from @target. The new key is queued with
+ * bch2_trans_update() on @dirent_iter; no commit is done here.
+ *
+ * Returns 0 or an error code.
+ */
+int __bch2_check_dirent_target(struct btree_trans *trans,
+			       struct btree_iter *dirent_iter,
+			       struct bkey_s_c_dirent d,
+			       struct bch_inode_unpacked *target,
+			       bool in_fsck)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
+
+	if (ret)
+		goto err;
+
+	if (fsck_err_on(d.v->d_type != inode_d_type(target),
+			trans, dirent_d_type_wrong,
+			"incorrect d_type: got %s, should be %s:\n%s",
+			bch2_d_type_str(d.v->d_type),
+			bch2_d_type_str(inode_d_type(target)),
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+		struct bkey_i_dirent *fixed = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+
+		ret = PTR_ERR_OR_ZERO(fixed);
+		if (ret)
+			goto err;
+
+		bkey_reassemble(&fixed->k_i, d.s_c);
+		fixed->v.d_type = inode_d_type(target);
+
+		if (fixed->v.d_type != DT_SUBVOL) {
+			fixed->v.d_inum = cpu_to_le64(target->bi_inum);
+		} else {
+			fixed->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+			fixed->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+		}
+
+		/* Success and failure both continue into the exit path below */
+		ret = bch2_trans_update(trans, dirent_iter, &fixed->k_i, 0);
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/libbcachefs/fs-common.h b/libbcachefs/namei.h
index 2b59210b..2e6f6364 100644
--- a/libbcachefs/fs-common.h
+++ b/libbcachefs/namei.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_COMMON_H
-#define _BCACHEFS_FS_COMMON_H
+#ifndef _BCACHEFS_NAMEI_H
+#define _BCACHEFS_NAMEI_H
#include "dirent.h"
@@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-#endif /* _BCACHEFS_FS_COMMON_H */
+int __bch2_check_dirent_target(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c_dirent,
+ struct bch_inode_unpacked *, bool);
+
+/*
+ * True when @inode's back-reference (bi_dir, bi_dir_offset) names exactly the
+ * position of dirent @d.
+ */
+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+					  struct bkey_s_c_dirent d)
+{
+	if (inode->bi_dir != d.k->p.inode)
+		return false;
+
+	return inode->bi_dir_offset == d.k->p.offset;
+}
+
+/*
+ * Inline fast path: when @target already points back at @d and the dirent's
+ * d_type matches the inode, there is nothing to check and 0 is returned.
+ * Anything else is handed to __bch2_check_dirent_target().
+ */
+static inline int bch2_check_dirent_target(struct btree_trans *trans,
+					   struct btree_iter *dirent_iter,
+					   struct bkey_s_c_dirent d,
+					   struct bch_inode_unpacked *target,
+					   bool in_fsck)
+{
+	bool consistent = inode_points_to_dirent(target, d) &&
+			  d.v->d_type == inode_d_type(target);
+
+	return likely(consistent)
+		? 0
+		: __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
+}
+
+#endif /* _BCACHEFS_NAMEI_H */
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 24a46103..e5c42e20 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -44,7 +44,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-static const char * const __bch2_csum_types[] = {
+const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -219,10 +219,10 @@ typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
typedef u64 (*member_opt_get_fn)(const struct bch_member *);
typedef void (*member_opt_set_fn)(struct bch_member *, u64);
-static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
-static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
-static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
-static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
#define type_compatible_or_null(_p, _type) \
__builtin_choose_expr( \
@@ -551,14 +551,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
goto bad_opt;
ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
- if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
- prt_printf(parse_later, "%s=%s,", name, val);
- if (parse_later->allocation_failure) {
- ret = -ENOMEM;
- goto out;
+ if (ret == -BCH_ERR_option_needs_open_fs) {
+ ret = 0;
+
+ if (parse_later) {
+ prt_printf(parse_later, "%s=%s,", name, val);
+ if (parse_later->allocation_failure)
+ ret = -ENOMEM;
}
- ret = 0;
goto out;
}
@@ -569,28 +570,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
bch2_opt_set_by_id(opts, id, v);
ret = 0;
- goto out;
-
+out:
+ printbuf_exit(&err);
+ return ret;
bad_opt:
- pr_err("Bad mount option %s", name);
ret = -BCH_ERR_option_name;
goto out;
-
bad_val:
- pr_err("Invalid mount option %s", err.buf);
ret = -BCH_ERR_option_value;
-
-out:
- printbuf_exit(&err);
- return ret;
+ goto out;
}
int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later, char *options)
+ struct printbuf *parse_later, char *options,
+ bool ignore_unknown)
{
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
- int ret;
+ int ret = 0;
if (!options)
return 0;
@@ -615,14 +612,14 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
val = opt;
ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
- if (ret < 0)
- goto out;
+ if (ret == -BCH_ERR_option_name && ignore_unknown)
+ ret = 0;
+ if (ret) {
+ pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
+ break;
+ }
}
- ret = 0;
- goto out;
-
-out:
kfree(copied_opts_start);
return ret;
}
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 8d1dc881..4d063130 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -16,6 +16,7 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
+extern const char * const __bch2_csum_types[];
extern const char * const __bch2_csum_opts[];
extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
@@ -499,11 +500,6 @@ enum fsck_err_opts {
OPT_STR(bch2_member_states), \
BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
"state", "rw,ro,failed,spare") \
- x(fs_size, u64, \
- OPT_DEVICE|OPT_HIDDEN, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_MEMBER_OPT, 0, \
- "size", "Size of filesystem on device") \
x(bucket_size, u32, \
OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
OPT_UINT(0, S64_MAX), \
@@ -640,7 +636,7 @@ int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
- char *);
+ char *, bool);
/* inode opts: */
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 29a56938..10c6a7fd 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -95,6 +95,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
bch2_bkey_ptrs_need_move(c, opts, ptrs);
}
@@ -107,6 +110,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
if (!opts)
return 0;
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index a6e26733..266c5770 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -13,12 +13,12 @@
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
-#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index fa27ec59..5c4e5de7 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -16,6 +16,7 @@ enum counters_flags {
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 67455beb..1736abea 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -311,13 +311,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
- x(MAX, 307, 0)
+ x(MAX, 308, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index d662adfb..99f9a0aa 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1990,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 251ba822..8c200b55 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
+__sysfs_attribute(read_fua_test, 0400);
read_attribute(uuid);
read_attribute(minor);
@@ -310,6 +311,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
}
+/*
+ * Sysfs helper: time synchronous, block-sized reads from @ca to estimate
+ * whether the drive answers FUA reads from its cache, and print a report to
+ * @out.
+ *
+ * Three phases are timed, each bounded to 1000 reads or roughly two seconds
+ * (test_duration), whichever comes first:
+ *  - plain reads that never set bi_sector, so every read targets the same
+ *    location (presumably sector 0 as left by bio_init() - confirm)
+ *  - the same reads with REQ_FUA set
+ *  - plain reads at random block-aligned offsets within the device
+ *
+ * Returns 0 on success or when the device is offline (no ioref available),
+ * -ENOMEM if the bio or the data buffer cannot be allocated, or the error
+ * returned by submit_bio_wait().
+ */
+static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
+{
+	struct bch_fs *c = ca->fs;
+	struct bio *bio = NULL;
+	void *buf = NULL;
+	unsigned bs = c->opts.block_size, iters;
+	u64 end, test_duration = NSEC_PER_SEC * 2;
+	struct bch2_time_stats stats_nofua, stats_fua, stats_random;
+	int ret = 0;
+
+	bch2_time_stats_init_no_pcpu(&stats_nofua);
+	bch2_time_stats_init_no_pcpu(&stats_fua);
+	bch2_time_stats_init_no_pcpu(&stats_random);
+
+	if (!bch2_dev_get_ioref(c, ca->dev_idx, READ)) {
+		prt_str(out, "offline\n");
+		return 0;
+	}
+
+	struct block_device *bdev = ca->disk_sb.bdev;
+
+	bio = bio_kmalloc(1, GFP_KERNEL);
+	if (!bio) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	buf = kmalloc(bs, GFP_KERNEL);
+	if (!buf) {
+		/*
+		 * Must report the failure: leaving ret at 0 here would make
+		 * the caller see success with an empty report.
+		 */
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* Phase 1: repeated plain reads of the same location */
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_nofua, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	/* Phase 2: same location again, this time with REQ_FUA */
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_fua, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
+
+	/* Phase 3: plain reads at random block-aligned offsets (baseline) */
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+		/* Round the random byte offset down to a block, then convert to 512-byte sectors */
+		bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_random, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats);
+	u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats);
+	u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats);
+
+	u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats);
+	u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats);
+	u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats);
+
+	printbuf_tabstop_push(out, 8);
+	printbuf_tabstop_push(out, 12);
+	printbuf_tabstop_push(out, 12);
+	prt_printf(out, "This test must be run on an idle drive for accurate results\n");
+	prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
+	prt_printf(out, "fua support advertized: %s\n", bdev_fua(bdev) ? "yes" : "no");
+	prt_newline(out);
+	prt_printf(out, "ns:\tlatency\rstddev\r\n");
+	prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua);
+	prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua);
+	prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand);
+
+	/*
+	 * Heuristic: repeated reads count as cached when their mean latency
+	 * is less than half that of random reads; FUA reads count as cached
+	 * when, in addition, their mean falls below the midpoint of the
+	 * repeated-read and random-read means.
+	 */
+	bool read_cache = ns_nofua * 2 < ns_rand;
+	bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
+
+	if (!read_cache)
+		prt_str(out, "reads don't appear to be cached - safe\n");
+	else if (!fua_cached)
+		prt_str(out, "fua reads don't appear to be cached - safe\n");
+	else
+		prt_str(out, "fua reads appear to be cached - unsafe\n");
+err:
+	/* kfree(NULL) is a no-op, so this is safe on every path that reaches here */
+	kfree(buf);
+	kfree(bio);
+	percpu_ref_put(&ca->io_ref);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -823,6 +934,9 @@ SHOW(bch2_dev)
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, ca);
+ if (attr == &sysfs_read_fua_test)
+ return bch2_read_fua_test(out, ca);
+
int opt_id = bch2_opt_lookup(attr->name);
if (opt_id >= 0)
return sysfs_opt_show(c, ca, opt_id, out);
@@ -879,6 +993,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
+ &sysfs_read_fua_test,
+
/* debug: */
&sysfs_alloc_debug,
&sysfs_open_buckets,
diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c
index 3fe82757..a8382d87 100644
--- a/libbcachefs/time_stats.c
+++ b/libbcachefs/time_stats.c
@@ -10,6 +10,9 @@
#include "eytzinger.h"
#include "time_stats.h"
+/* disable automatic switching to percpu mode */
+#define TIME_STATS_NONPCPU ((struct time_stat_buffer *) 1)
+
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
@@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
- if (!stats->buffer) {
+ if ((unsigned long) stats->buffer <= 1) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ if (!stats->buffer &&
+ mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
@@ -157,7 +161,8 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
memset((void *) stats + offset, 0, sizeof(*stats) - offset);
- if (stats->buffer) {
+ if (stats->buffer &&
+ stats->buffer != TIME_STATS_NONPCPU) {
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
@@ -167,7 +172,10 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
- free_percpu(stats->buffer);
+ if (stats->buffer != TIME_STATS_NONPCPU) {
+ free_percpu(stats->buffer);
+ stats->buffer = NULL;
+ }
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
@@ -177,3 +185,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
+
+/*
+ * Initialize @stats via bch2_time_stats_init(), then store the
+ * TIME_STATS_NONPCPU sentinel in ->buffer.  __bch2_time_stats_update() only
+ * allocates the percpu buffer when ->buffer is NULL, so stats set up this way
+ * keep being updated under the spinlock and never switch to percpu mode.
+ */
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
+{
+ bch2_time_stats_init(stats);
+ /* set after the init call so the sentinel is what ->buffer ends up holding */
+ stats->buffer = TIME_STATS_NONPCPU;
+}
diff --git a/libbcachefs/time_stats.h b/libbcachefs/time_stats.h
index dc6493f7..eddb0985 100644
--- a/libbcachefs/time_stats.h
+++ b/libbcachefs/time_stats.h
@@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
{
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 519d00d6..8c07189a 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -339,6 +339,11 @@ DEFINE_EVENT(bio, io_read_reuse_race,
TP_ARGS(bio)
);
+DEFINE_EVENT(bio, io_read_fail_and_poison,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
/* ec.c */
TRACE_EVENT(stripe_create,
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index d41e133a..7d921fc9 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -431,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src,
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("rep ; movsq"
@@ -508,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
u64 *dst = (u64 *) _dst + u64s - 1;
u64 *src = (u64 *) _src + u64s - 1;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("std ;\n"