author    Kent Overstreet <kent.overstreet@linux.dev>  2025-03-16 16:08:41 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>  2025-03-17 14:23:49 -0400
commit    c0836924b19ae84ad95d7ec97455c96f61b81201 (patch)
tree      e48afe4496a7e6ef8c7ec6a1d5d14064f69747ba
parent    f42ee45c6e6409ad7c971aa37aef69b97d761006 (diff)
Update bcachefs sources to 4d28432bcc5f bcachefs: Validate bch_sb.offset field
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--  .bcachefs_revision                  2
-rw-r--r--  include/crypto/sha2.h               6
-rw-r--r--  include/linux/math64.h             67
-rw-r--r--  include/linux/random.h             17
-rw-r--r--  libbcachefs/alloc_background.c      2
-rw-r--r--  libbcachefs/bcachefs.h              1
-rw-r--r--  libbcachefs/bcachefs_format.h       2
-rw-r--r--  libbcachefs/btree_cache.c           1
-rw-r--r--  libbcachefs/btree_io.c             45
-rw-r--r--  libbcachefs/btree_iter.c           14
-rw-r--r--  libbcachefs/btree_update.h          8
-rw-r--r--  libbcachefs/btree_write_buffer.c   21
-rw-r--r--  libbcachefs/buckets.h               4
-rw-r--r--  libbcachefs/checksum.c             23
-rw-r--r--  libbcachefs/checksum.h              2
-rw-r--r--  libbcachefs/data_update.c         104
-rw-r--r--  libbcachefs/ec.c                   16
-rw-r--r--  libbcachefs/errcode.h              17
-rw-r--r--  libbcachefs/extents.c             147
-rw-r--r--  libbcachefs/extents.h               5
-rw-r--r--  libbcachefs/extents_types.h         7
-rw-r--r--  libbcachefs/fs-io-buffered.c        3
-rw-r--r--  libbcachefs/fs.c                   71
-rw-r--r--  libbcachefs/inode.c                14
-rw-r--r--  libbcachefs/io_read.c             249
-rw-r--r--  libbcachefs/io_read.h              20
-rw-r--r--  libbcachefs/io_write.c             38
-rw-r--r--  libbcachefs/journal_io.c            5
-rw-r--r--  libbcachefs/move.c                 38
-rw-r--r--  libbcachefs/movinggc.c             11
-rw-r--r--  libbcachefs/opts.h                  5
-rw-r--r--  libbcachefs/rebalance.c            42
-rw-r--r--  libbcachefs/recovery.c              2
-rw-r--r--  libbcachefs/sb-counters_format.h    1
-rw-r--r--  libbcachefs/str_hash.h              8
-rw-r--r--  libbcachefs/super-io.c             40
-rw-r--r--  libbcachefs/super-io.h              2
-rw-r--r--  libbcachefs/super.c                14
-rw-r--r--  libbcachefs/sysfs.c                70
-rw-r--r--  libbcachefs/trace.h                 5
-rw-r--r--  libbcachefs/util.c                 36
-rw-r--r--  libbcachefs/util.h                 14
42 files changed, 690 insertions, 509 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 7d7555ff..e778bec6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-46af7258b951a79a66511172ab8772ad2dfaa4e3
+4d28432bcc5f91caf053f64a1cde1a6286adf4a6
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 8a46202b..b6183bd0 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -7,6 +7,7 @@
#define _CRYPTO_SHA_H
#include <linux/types.h>
+#include <sodium/crypto_hash_sha256.h>
#define SHA1_DIGEST_SIZE 20
#define SHA1_BLOCK_SIZE 64
@@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *hash);
+
+static inline void sha256(const u8 *data, unsigned int len, u8 *out)
+{
+ crypto_hash_sha256(out, data, len);
+}
#endif
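
The sha256() shim added above routes the userspace build through libsodium's one-shot API in place of the kernel's crypto_shash interface (the matching crypto_shash plumbing is removed from libbcachefs/checksum.c below). A minimal standalone sketch of the underlying call, illustration only and not part of the patch; it assumes libsodium is installed and linked with -lsodium, and checks the standard SHA-256 test vector for "abc":

        /* Illustration only: exercising the libsodium one-shot API that the
         * sha256() shim wraps.  Build with -lsodium. */
        #include <sodium/crypto_hash_sha256.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned char digest[crypto_hash_sha256_BYTES]; /* 32 bytes, == SHA256_DIGEST_SIZE */
                const unsigned char msg[] = "abc";

                crypto_hash_sha256(digest, msg, 3);

                for (unsigned i = 0; i < sizeof(digest); i++)
                        printf("%02x", digest[i]);
                printf("\n"); /* ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad */
                return 0;
        }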
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 5eb6f064..13efcc08 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
return div_s64_rem(dividend, divisor, &remainder);
}
+#ifndef mul_u32_u32
+/*
+ * Many a GCC version messes this up and generates a 64x64 mult :-(
+ */
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ return (u64)a * b;
+}
+#endif
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u64_shr
+static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
+{
+ return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u64_shr */
+
+#else
+
+#ifndef mul_u64_u64_shr
+static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
+{
+ union {
+ u64 ll;
+ struct {
+#ifdef __BIG_ENDIAN
+ u32 high, low;
+#else
+ u32 low, high;
+#endif
+ } l;
+ } rl, rm, rn, rh, a0, b0;
+ u64 c;
+
+ a0.ll = a;
+ b0.ll = b;
+
+ rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
+ rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
+ rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
+ rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
+
+ /*
+ * Each of these lines computes a 64-bit intermediate result into "c",
+ * starting at bits 32-95. The low 32-bits go into the result of the
+ * multiplication, the high 32-bits are carried into the next step.
+ */
+ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
+ rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
+ rh.l.high = (c >> 32) + rh.l.high;
+
+ /*
+ * The 128-bit result of the multiplication is in rl.ll and rh.ll,
+ * shift it right and throw away the high part of the result.
+ */
+ if (shift == 0)
+ return rl.ll;
+ if (shift < 64)
+ return (rl.ll >> shift) | (rh.ll << (64 - shift));
+ return rh.ll >> (shift & 63);
+}
+#endif /* mul_u64_u64_shr */
+
+#endif
+
#endif /* _LINUX_MATH64_H */
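
When the compiler has no usable __int128, the fallback mul_u64_u64_shr() above assembles the exact 128-bit product from four 32-bit partial products, schoolbook style, and then shifts. A self-contained sketch of the same decomposition, illustration only; it assumes a GCC/clang-style compiler that does provide unsigned __int128 so the reference result can be checked:

        /* Illustration only: the 32x32 decomposition behind the fallback
         * mul_u64_u64_shr(), verified against a native 128-bit multiply. */
        #include <assert.h>
        #include <stdint.h>

        static uint64_t mul_shr_portable(uint64_t a, uint64_t b, unsigned shift)
        {
                uint64_t al = (uint32_t)a, ah = a >> 32;
                uint64_t bl = (uint32_t)b, bh = b >> 32;

                /* four partial products: a*b = (ah*bh << 64) + ((ah*bl + al*bh) << 32) + al*bl */
                uint64_t ll = al * bl, lh = al * bh, hl = ah * bl, hh = ah * bh;

                /* accumulate the middle terms; carries propagate into the high word */
                uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
                uint64_t lo  = (mid << 32) | (uint32_t)ll;
                uint64_t hi  = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);

                if (shift == 0)
                        return lo;
                if (shift < 64)
                        return (lo >> shift) | (hi << (64 - shift));
                return hi >> (shift & 63);
        }

        int main(void)
        {
                uint64_t a = 0xdeadbeefcafebabeULL, b = 0x123456789abcdef0ULL;

                for (unsigned s = 0; s < 128; s += 7)
                        assert(mul_shr_portable(a, b, s) ==
                               (uint64_t)(((unsigned __int128)a * b) >> s));
                return 0;
        }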
diff --git a/include/linux/random.h b/include/linux/random.h
index 3203d13c..9b2bb59a 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -9,7 +9,9 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bug.h>
+#include <linux/kernel.h>
#include <linux/log2.h>
+#include <linux/math64.h>
#ifdef SYS_getrandom
static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
@@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil)
}
}
+static inline u64 get_random_u64_below(u64 ceil)
+{
+ if (ceil <= 1)
+ return 0;
+ if (ceil <= U32_MAX)
+ return get_random_u32_below(ceil);
+
+ for (;;) {
+ u64 rand = get_random_u64();
+ u64 mult = ceil * rand;
+ if (likely(mult >= -ceil % ceil))
+ return mul_u64_u64_shr(ceil, rand, 64);
+ }
+}
+
#endif /* _LINUX_RANDOM_H */
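
get_random_u64_below() above widens the kernel's multiply-and-reject technique (Lemire's method) to 64 bits: mul_u64_u64_shr(ceil, rand, 64) takes the high 64 bits of the 128-bit product ceil * rand, which is uniform over [0, ceil) once draws whose low 64 bits fall below 2^64 mod ceil are rejected, and in unsigned arithmetic that threshold is exactly -ceil % ceil. The same technique at 8-bit width, exhaustively checked for uniformity; illustration only, not part of the patch:

        /* Illustration only: multiply-and-reject at 8-bit width, where the
         * whole input space can be enumerated. */
        #include <assert.h>
        #include <stdint.h>

        static unsigned counts[6];

        int main(void)
        {
                const uint8_t ceil = 6;
                /* threshold = 2^8 mod ceil, computed as (-ceil) mod ceil in 8-bit math */
                uint8_t thresh = (uint8_t)-ceil % ceil;

                for (unsigned r = 0; r < 256; r++) {
                        uint16_t mult = (uint16_t)(ceil * r);   /* 8x8 -> 16-bit product */
                        if ((uint8_t)mult >= thresh)            /* accept: low byte clears threshold */
                                counts[mult >> 8]++;            /* high byte is the bounded value */
                }

                /* every value in [0, ceil) was produced equally often */
                for (unsigned i = 1; i < ceil; i++)
                        assert(counts[i] == counts[0]);
                return 0;
        }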
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ecad4a78..4dfcf3e6 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v2_unpack_error,
+ c, alloc_v3_unpack_error,
"unpack error");
fsck_err:
return ret;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b432bb6e..0ea593e8 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -979,7 +979,6 @@ struct bch_fs {
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
- struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 7a5b0d21..e96d8776 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 1ec1f90e..54666027 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
btree_node_write_in_flight(b));
btree_node_data_free(bc, b);
+ cond_resched();
}
BUG_ON(!bch2_journal_error(&c->journal) &&
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 6638bb1f..6abc9f17 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
- unsigned commit_flags =
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw;
u64 start_time = wbio->start_time;
int ret = 0;
@@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work)
wbio->wbio.used_mempool,
wbio->data);
- if (wbio->wbio.failed.nr) {
- ret = bch2_trans_do(c,
- bch2_btree_node_rewrite_key_get_iter(trans, b,
- commit_flags));
- } else if (!wbio->wbio.first_btree_write) {
- ret = bch2_trans_do(c,
- bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- commit_flags, true));
- }
+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (ret) {
- set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+ ret = -BCH_ERR_btree_node_write_all_failed;
+ goto err;
}
+ if (wbio->wbio.first_btree_write) {
+ if (wbio->wbio.failed.nr) {
+
+ }
+ } else {
+ ret = bch2_trans_do(c,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+ BCH_WATERMARK_interior_updates|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
+ !wbio->wbio.failed.nr));
+ if (ret)
+ goto err;
+ }
+out:
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b, start_time);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
+ goto out;
}
static void btree_node_write_endio(struct bio *bio)
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index e32fce4f..7542c6f9 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_peek(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->key.k.p;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 8f22ef9a..47d8690f 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+int bch2_btree_write_buffer_insert_err(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
+
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
{
+ if (unlikely(!btree_type_uses_write_buffer(btree))) {
+ int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
+ dump_stack();
+ return ret;
+ }
/*
* Most updates skip the btree write buffer until journal replay is
* finished because synchronization with journal replay relies on having
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index b56c4987..2c09d19d 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -264,6 +264,22 @@ out:
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
+int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, "\n");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EROFS;
+}
+
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
- BUG_ON(!btree_type_uses_write_buffer(k->btree));
+ if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
+ ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
+ goto err;
+ }
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 6aeec1c0..c5363256 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
static inline int gen_after(u8 a, u8 b)
{
- int r = gen_cmp(a, b);
-
- return r > 0 ? r : 0;
+ return max(0, gen_cmp(a, b));
}
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 23a38357..7f9e4c59 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
return 0;
}
+#if 0
+
+/*
+ * This seems to be duplicating code in cmd_remove_passphrase() in
+ * bcachefs-tools, but we might want to switch userspace to use this - and
+ * perhaps add an ioctl for calling this at runtime, so we can take the
+ * passphrase off of a mounted filesystem (which has come up).
+ */
int bch2_disable_encryption(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
@@ -725,6 +733,10 @@ out:
return ret;
}
+/*
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
+ * should be
+ */
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
struct bch_encrypted_key key;
@@ -781,6 +793,7 @@ err:
memzero_explicit(&key, sizeof(key));
return ret;
}
+#endif
void bch2_fs_encryption_exit(struct bch_fs *c)
{
@@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
crypto_free_shash(c->poly1305);
if (c->chacha20)
crypto_free_sync_skcipher(c->chacha20);
- if (c->sha256)
- crypto_free_shash(c->sha256);
}
int bch2_fs_encryption_init(struct bch_fs *c)
@@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
struct bch_key key;
int ret = 0;
- c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->sha256);
- if (ret) {
- c->sha256 = NULL;
- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
- goto out;
- }
-
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 43b9d71f..4ac251c8 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
+#if 0
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
+#endif
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 522574bc..08bb7f30 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
-static bool can_allocate_without_blocking(struct bch_fs *c,
- struct data_update *m)
-{
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return false;
-
- unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
- ? m->op.target
- : 0;
- struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
- darray_for_each(m->op.devs_have, i)
- __clear_bit(*i, devs.d);
-
- rcu_read_lock();
- unsigned nr_replicas = 0, i;
- for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
- struct bch_dev *ca = bch2_dev_rcu(c, i);
-
- struct bch_dev_usage usage;
- bch2_dev_usage_read_fast(ca, &usage);
-
- if (!dev_buckets_free(ca, usage, m->op.watermark))
- continue;
-
- nr_replicas += ca->mi.durability;
- if (nr_replicas >= m->op.nr_replicas)
- break;
- }
- rcu_read_unlock();
-
- return nr_replicas >= m->op.nr_replicas;
-}
-
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_io_opts *io_opts)
{
@@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
}
rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+ m->rbio.data_update = true;
m->rbio.bio.bi_iter.bi_size = buf_bytes;
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
return 0;
}
-static bool can_write_extent(struct bch_fs *c,
- struct bch_devs_list *devs_have,
- unsigned target)
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
+ if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+ unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+ return -BCH_ERR_data_update_done_would_block;
+
+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+ ? m->op.target
+ : 0;
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
- darray_for_each(*devs_have, i)
+ darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
- return !bch2_is_zero(&devs, sizeof(devs));
+ rcu_read_lock();
+ unsigned nr_replicas = 0, i;
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+ struct bch_dev *ca = bch2_dev_rcu(c, i);
+
+ struct bch_dev_usage usage;
+ bch2_dev_usage_read_fast(ca, &usage);
+
+ if (!dev_buckets_free(ca, usage, m->op.watermark))
+ continue;
+
+ nr_replicas += ca->mi.durability;
+ if (nr_replicas >= m->op.nr_replicas)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nr_replicas)
+ return -BCH_ERR_data_update_done_no_rw_devs;
+ if (nr_replicas < m->op.nr_replicas)
+ return -BCH_ERR_insufficient_devices;
+ return 0;
}
int bch2_data_update_init(struct btree_trans *trans,
@@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
- if (!can_write_extent(c, &m->op.devs_have,
- m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
- /*
- * Check if we have rw devices not in devs_have: this can happen
- * if we're trying to move data on a ro or failed device
- *
- * If we can't move it, we need to clear the rebalance_work bit,
- * if applicable
- *
- * Also, copygc should skip ro/failed devices:
- */
- return -BCH_ERR_data_update_done_no_rw_devs;
- }
-
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
@@ -852,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans,
goto out_bkey_buf_exit;
}
- if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
- !can_allocate_without_blocking(c, m)) {
- ret = -BCH_ERR_data_update_done_would_block;
+ /*
+ * Check if the allocation will succeed, to avoid getting an error later
+ * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+ * read:
+ *
+ * This guards against
+ * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+ * - Destination target full
+ * - Device(s) in destination target offline
+ * - Insufficient durability available in destination target
+ * (i.e. trying to move a durability=2 replica to a target with a
+ * single durability=2 device)
+ */
+ ret = can_write_extent(c, m);
+ if (ret)
goto out_bkey_buf_exit;
- }
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 865cc53a..c73ba73f 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->disk_label = s->disk_label;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
+ ret = s->err;
goto err;
}
@@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
+ ret = -BCH_ERR_ec_block_read;
goto err;
}
@@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
+ ret = -BCH_ERR_ec_block_write;
goto err;
}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 531fe575..cb27de6f 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -231,6 +231,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_offset) \
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
@@ -273,21 +274,25 @@
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, extent_poisened) \
- x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
x(EIO, device_offline) \
x(EIO, EIO_fault_injected) \
+ x(EIO, ec_block_read) \
+ x(EIO, ec_block_write) \
x(EIO, data_read) \
+ x(BCH_ERR_data_read, no_device_to_read_from) \
+ x(BCH_ERR_data_read, data_read_io_err) \
+ x(BCH_ERR_data_read, data_read_csum_err) \
x(BCH_ERR_data_read, data_read_retry) \
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
- x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
- x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
- x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
+ x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
x(BCH_ERR_data_read, data_read_decompress_err) \
x(BCH_ERR_data_read, data_read_decrypt_err) \
x(BCH_ERR_data_read, data_read_ptr_stale_race) \
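
This hunk reworks the read-error hierarchy: the retryable cases move under BCH_ERR_data_read_retry and gain a retry_ infix, while data_read_io_err and data_read_csum_err become terminal children of BCH_ERR_data_read, so callers can tell "retry this" from "give up" by error class alone. A simplified illustration of the parent-chain matching idea; this is not the actual bcachefs implementation (which generates its parent table from the x() entries and uses negative codes), just the shape of it:

        /* Simplified illustration of hierarchical errcodes: each child records
         * its parent, and err_matches() walks up the chain.  Sign handling and
         * table generation are elided. */
        #include <assert.h>
        #include <stdbool.h>

        enum {
                ERR_START = 2048,
                ERR_data_read,
                ERR_data_read_retry,
                ERR_data_read_retry_avoid,
                ERR_data_read_retry_csum_err,
                ERR_MAX,
        };

        static const int err_parent[] = {
                [ERR_data_read - ERR_START]                  = 0,
                [ERR_data_read_retry - ERR_START]            = ERR_data_read,
                [ERR_data_read_retry_avoid - ERR_START]      = ERR_data_read_retry,
                [ERR_data_read_retry_csum_err - ERR_START]   = ERR_data_read_retry_avoid,
        };

        static bool err_matches(int err, int class)
        {
                while (err) {
                        if (err == class)
                                return true;
                        err = (err > ERR_START && err < ERR_MAX)
                                ? err_parent[err - ERR_START] : 0;
                }
                return false;
        }

        int main(void)
        {
                assert(err_matches(ERR_data_read_retry_csum_err, ERR_data_read));
                assert(!err_matches(ERR_data_read, ERR_data_read_retry));
                return 0;
        }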
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f62ee96b..1da754a8 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -28,6 +28,8 @@
#include "trace.h"
#include "util.h"
+#include <linux/random.h>
+
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
BCH_EXTENT_FLAGS()
@@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca)
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
- const struct extent_ptr_decoded p2)
+ u64 p1_latency,
+ struct bch_dev *ca1,
+ const struct extent_ptr_decoded p2,
+ u64 p2_latency)
{
- if (likely(!p1.do_ec_reconstruct &&
- !p2.do_ec_reconstruct)) {
- struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
- struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
- int failed_delta = dev_failed(ca1) - dev_failed(ca2);
-
- if (failed_delta)
- return failed_delta < 0;
+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
- u64 l1 = dev_latency(ca1);
- u64 l2 = dev_latency(ca2);
+ int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+ if (unlikely(failed_delta))
+ return failed_delta < 0;
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- l1 *= l1;
- l2 *= l2;
+ if (unlikely(bch2_force_reconstruct_read))
+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
- /* Pick at random, biased in favor of the faster device: */
+ if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
- return bch2_rand_range(l1 + l2) > l1;
- }
+ int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+ if (unlikely(crc_retry_delta))
+ return crc_retry_delta < 0;
- if (bch2_force_reconstruct_read)
- return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
+ /* Pick at random, biased in favor of the faster device: */
- return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
+ return get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded *pick,
int dev)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_dev_io_failures *f;
- unsigned csum_retry = 0;
- bool have_csum_retries = false;
- int ret = 0;
+ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+ bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return -BCH_ERR_extent_poisened;
-again:
+
rcu_read_lock();
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 pick_latency;
+
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ have_dirty_ptrs |= !p.ptr.cached;
+
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
- ret = 0;
- break;
+ rcu_read_unlock();
+ return 0;
}
/* Are we being asked to read from a specific device? */
if (dev >= 0 && p.ptr.dev != dev)
continue;
- /*
- * If there are any dirty pointers it's an error if we can't
- * read:
- */
- if (!ret && !p.ptr.cached)
- ret = -BCH_ERR_no_device_to_read_from;
-
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- if (unlikely(failed) &&
- (f = bch2_dev_io_failures(failed, p.ptr.dev))) {
- have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
+ struct bch_dev_io_failures *f =
+ unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (unlikely(f)) {
+ p.crc_retry_nr = f->failed_csum_nr;
+ p.has_ec &= ~f->failed_ec;
- if (p.has_ec &&
- !f->failed_ec &&
- (f->failed_io || f->failed_csum_nr))
+ if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+ have_io_errors |= f->failed_io;
+ have_io_errors |= f->failed_ec;
+ }
+ have_csum_errors |= !!f->failed_csum_nr;
+
+ if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
- f->failed_csum_nr > csum_retry)
+ f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
+ have_missing_devs |= ca && !bch2_dev_is_online(ca);
+
if (!ca || !bch2_dev_is_online(ca)) {
- if (p.has_ec)
- p.do_ec_reconstruct = true;
- else
+ if (!p.has_ec)
continue;
+ p.do_ec_reconstruct = true;
}
- if (p.has_ec && bch2_force_reconstruct_read)
+ if (bch2_force_reconstruct_read && p.has_ec)
p.do_ec_reconstruct = true;
- if (ret > 0 && !ptr_better(c, p, *pick))
- continue;
-
- *pick = p;
- ret = 1;
+ u64 p_latency = dev_latency(ca);
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ p_latency *= p_latency;
+
+ if (!have_pick ||
+ ptr_better(c,
+ p, p_latency, ca,
+ *pick, pick_latency)) {
+ *pick = p;
+ pick_latency = p_latency;
+ have_pick = true;
+ }
}
rcu_read_unlock();
- if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
- have_csum_retries &&
- csum_retry < BCH_MAX_CSUM_RETRIES)) {
- csum_retry++;
- goto again;
- }
-
- return ret;
+ if (have_pick)
+ return 1;
+ if (!have_dirty_ptrs)
+ return 0;
+ if (have_missing_devs)
+ return -BCH_ERR_no_device_to_read_from;
+ if (have_csum_errors)
+ return -BCH_ERR_data_read_csum_err;
+ if (have_io_errors)
+ return -BCH_ERR_data_read_io_err;
+
+ WARN_ONCE(1, "unhandled error case in %s\n", __func__);
+ return -EINVAL;
}
/* KEY_TYPE_btree_ptr: */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index b4058502..e78a39e7 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
- (_ptr).has_ec = false; \
- (_ptr).do_ec_reconstruct = false; \
+ (_ptr).has_ec = false; \
+ (_ptr).do_ec_reconstruct = false; \
+ (_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index f8b8e598..e51529dc 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked {
struct extent_ptr_decoded {
bool has_ec;
- unsigned do_ec_reconstruct;
+ bool do_ec_reconstruct;
+ u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
};
-#define BCH_MAX_CSUM_RETRIES 3
-
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
- unsigned failed_csum_nr:4,
+ unsigned failed_csum_nr:6,
failed_io:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 881b3051..5ab1c73c 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans,
unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
+ /* ensure proper alignment */
+ order = min(order, __ffs(folio_offset|BIT(31)));
+
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;
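
The clamp added above keeps the readahead folio allocation legal: a folio of order k spans page-cache indices [i, i + 2^k) and must be naturally aligned, so the largest usable order at index i is the number of trailing zero bits in i, and OR-ing in BIT(31) keeps __ffs() well defined when the index is 0. A small sketch of that bound, using __builtin_ctzl() as a stand-in for the kernel's __ffs():

        /* Illustration only: the largest legal folio order at a page-cache
         * index is the count of trailing zero bits in the index; OR-ing in
         * bit 31 keeps the count defined (and capped) at index 0. */
        #include <assert.h>

        static unsigned max_order_at(unsigned long index)
        {
                return __builtin_ctzl(index | (1UL << 31)); /* stand-in for __ffs() */
        }

        int main(void)
        {
                assert(max_order_at(0) == 31);  /* index 0: any order up to the cap */
                assert(max_order_at(1) == 0);   /* odd index: only order-0 folios */
                assert(max_order_at(12) == 2);  /* 12 = 0b1100: aligned to 4 pages */
                assert(max_order_at(64) == 6);
                return 0;
        }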
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 459ca825..17ac9c55 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags,
- struct bch_opts opts)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
- if (opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts, errors))
- c->opts.errors = opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
@@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = sb->s_fs_info;
+ int ret = 0;
+
+ opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+
+ if (opts->opts.read_only != c->opts.read_only) {
+ down_write(&c->state_lock);
+
+ if (opts->opts.read_only) {
+ bch2_fs_read_only(c);
+
+ sb->s_flags |= SB_RDONLY;
+ } else {
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts->opts.read_only;
- return bch2_remount(sb, &fc->sb_flags, opts->opts);
+ up_write(&c->state_lock);
+ }
+
+ if (opt_defined(opts->opts, errors))
+ c->opts.errors = opts->opts.errors;
+err:
+ return bch2_err_class(ret);
}
static const struct fs_context_operations bch2_context_ops = {
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 04ec0520..7aca010e 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid, gid, mode, rdev, parent);
}
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- BUG();
- case KEY_TYPE_inode_generation:
- return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
- default:
- return 0;
- }
-}
-
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
@@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->_name##_from_inode = true; \
} else { \
opts->_name = c->opts._name; \
+ opts->_name##_from_inode = false; \
}
BCH_INODE_OPTS()
#undef x
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 652dbc58..4fb279f1 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -25,8 +25,15 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
}
rcu_read_unlock();
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
+ return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed)
return failed && failed->nr;
}
-static bool ptr_being_rewritten(struct bch_read_bio *orig,
- unsigned dev,
- unsigned flags)
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
- if (!(flags & BCH_READ_data_update))
+ EBUG_ON(rbio->split);
+
+ return rbio->data_update
+ ? container_of(rbio, struct data_update, rbio)
+ : NULL;
+}
+
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
+{
+ struct data_update *u = rbio_data_update(orig);
+ if (!u)
return false;
- struct data_update *u = container_of(orig, struct data_update, rbio);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct bpos pos,
struct extent_ptr_decoded *pick,
unsigned sectors,
- unsigned flags,
struct bch_read_bio *orig,
struct bch_io_failures *failed)
{
@@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) &&
- !ptr_being_rewritten(orig, ptr->dev, flags))
+ !ptr_being_rewritten(orig, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
@@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
- k, pos, pick, sectors, flags, orig, failed);
+ k, pos, pick, sectors, orig, failed);
if (!promote)
return NULL;
@@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o
if (ret)
return ret;
- if (rbio->flags & BCH_READ_data_update)
+ if (rbio->data_update)
prt_str(out, "(internal move) ");
return 0;
@@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
-static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct btree_iter *iter)
-{
- if (rbio->flags & BCH_READ_data_update) {
- struct data_update *u = container_of(rbio, struct data_update, rbio);
-
- return bch2_bkey_get_iter(trans, iter,
- u->btree_id, bkey_start_pos(&u->k.k->k), 0);
- } else {
- struct bpos pos = rbio->read_pos;
- int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- return bch2_bkey_get_iter(trans, iter,
- BTREE_ID_extents, pos, 0);
- }
-}
-
-static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bch_io_failures *failed)
-{
- struct btree_iter iter = {};
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
-
- if (!ret) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
- bch2_mark_io_failure(failed, &rbio->pick,
- rbio->ret == -BCH_ERR_data_read_csum_err);
- }
-
- bch2_trans_iter_exit(trans, &iter);
-}
-
-static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k, struct bch_io_failures *failed)
-{
- u64 flags = bch2_bkey_extent_flags(k);
- if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- /*
- * Make sure we actually attempt to read and got checksum failures from
- * every replica
- */
-
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
- continue;
-
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
- if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
- rcu_read_unlock();
- return 0;
- }
- }
- rcu_read_unlock();
-
- struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
- bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
- return PTR_ERR_OR_ZERO(new) ?:
- bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-}
-
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
@@ -530,9 +466,6 @@ err:
goto retry;
if (ret) {
- if (ret == -BCH_ERR_no_device_to_read_from && failed)
- maybe_poison_extent(trans, &iter, k, failed);
-
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
}
@@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work)
bvec_iter_sectors(rbio->bvec_iter));
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
- mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
+ bch2_mark_io_failure(&failed, &rbio->pick,
+ rbio->ret == -BCH_ERR_data_read_retry_csum_err);
if (!rbio->split) {
rbio->bio.bi_status = 0;
@@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;
- int ret = flags & BCH_READ_data_update
+ int ret = rbio->data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
: __bch2_read(trans, rbio, iter, inum, &failed, flags);
@@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work)
bch2_inum_offset_err_msg_trans(trans, &buf,
(subvol_inum) { subvol, read_pos.inode },
read_pos.offset << 9));
- if (rbio->flags & BCH_READ_data_update)
+ if (rbio->data_update)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry");
@@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work)
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work)
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "");
-
-static void corrupt_bio(struct bio *bio)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
- bio_for_each_segment(bv, bio, iter) {
- unsigned u64s = bv.bv_len / sizeof(u64);
-
- if (offset < u64s) {
- u64 *segment = bvec_kmap_local(&bv);
- segment[offset] = get_random_u64();
- kunmap_local(segment);
- return;
- }
- offset -= u64s;
- }
-}
-
-static inline void maybe_corrupt_bio(struct bio *bio)
-{
- if (bch2_read_corrupt_ratio &&
- !get_random_u32_below(bch2_read_corrupt_ratio))
- corrupt_bio(bio);
-}
-#else
-static inline void maybe_corrupt_bio(struct bio *bio)
-{
-}
-#endif
-
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
@@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work)
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_read_bio *parent = bch2_rbio_parent(rbio);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &parent->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
@@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
- maybe_corrupt_bio(src);
+ bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
@@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work)
*/
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
- bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
BLK_STS_IOERR);
goto out;
}
@@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
- if (likely(!(rbio->flags & BCH_READ_data_update))) {
+ if (likely(!parent->data_update)) {
/* Adjust crc to point to subset of data we want: */
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
@@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
+ struct data_update *u = rbio_data_update(orig);
int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
@@ -1106,16 +1006,7 @@ retry_pick:
goto retry_pick;
}
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- if (!(flags & BCH_READ_in_retry))
- bch2_trans_unlock(trans);
- else
- bch2_trans_unlock_long(trans);
-
- if (!(flags & BCH_READ_data_update)) {
+ if (likely(!u)) {
if (!(flags & BCH_READ_last_fragment) ||
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone;
@@ -1138,12 +1029,10 @@ retry_pick:
bounce = true;
}
} else {
- read_full = true;
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
percpu_ref_put(&ca->io_ref);
@@ -1152,6 +1041,7 @@ retry_pick:
}
iter.bi_size = pick.crc.compressed_size << 9;
+ read_full = true;
}
if (orig->opts.promote_target || have_io_error(failed))
@@ -1242,10 +1132,14 @@ retry_pick:
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
+ /* XXX: also nvme read recovery level */
+ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
+ rbio->bio.bi_opf |= REQ_FUA;
+
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
- if (!(flags & BCH_READ_data_update))
+ if (!u)
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
else
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
@@ -1255,7 +1149,7 @@ retry_pick:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update))
+ if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -1264,6 +1158,15 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio);
}
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ if (!(flags & BCH_READ_in_retry))
+ bch2_trans_unlock(trans);
+ else
+ bch2_trans_unlock_long(trans);
+
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
@@ -1275,7 +1178,7 @@ retry_pick:
printbuf_exit(&buf);
bch2_rbio_error(rbio,
- -BCH_ERR_data_read_device_offline,
+ -BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR);
goto out;
}
@@ -1302,7 +1205,7 @@ retry_pick:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
BLK_STS_IOERR);
goto out;
}
@@ -1314,6 +1217,8 @@ out:
if (likely(!(flags & BCH_READ_in_retry))) {
return 0;
} else {
+ bch2_trans_unlock(trans);
+
int ret;
rbio->context = RBIO_CONTEXT_UNBOUND;
@@ -1324,7 +1229,7 @@ out:
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick,
- ret == -BCH_ERR_data_read_csum_err);
+ ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
@@ -1341,11 +1246,11 @@ hole:
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
bvec_iter_sectors(iter));
/*
- * won't normally happen in the BCH_READ_data_update
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
- * to read no longer exists we have to signal that:
+ * won't normally happen in the data update (bch2_move_extent()) path,
+ * but if we retry and the extent we wanted to read no longer exists we
+ * have to signal that:
*/
- if (flags & BCH_READ_data_update)
+ if (u)
orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter);
@@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bkey_s_c k;
int ret;
- BUG_ON(flags & BCH_READ_data_update);
+ EBUG_ON(rbio->data_update);
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
if (ret)
goto err;
- if (unlikely(flags & BCH_READ_in_retry)) {
- struct data_update *u = flags & BCH_READ_data_update
- ? container_of(rbio, struct data_update, rbio)
- : NULL;
-
- if (u &&
- !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
- /* extent we wanted to read no longer exists: */
- ret = -BCH_ERR_data_read_key_overwritten;
- goto err;
- }
-
- if (!bkey_deleted(&sk.k->k) &&
- !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
- failed->nr = 0;
- }
-
s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
@@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
+ flags |= BCH_READ_must_bounce;
+
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
- if (unlikely(ret)) {
- if (ret == -BCH_ERR_no_device_to_read_from && failed)
- maybe_poison_extent(trans, &iter, k, failed);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
@@ -1472,7 +1362,6 @@ err:
bch2_rbio_done(rbio);
}
- bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
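
A pattern repeated throughout this file: BCH_READ_data_update used to be a read flag, but is now a data_update bit in struct bch_read_bio itself, and rbio_data_update() recovers the enclosing struct data_update via container_of() when the bit is set, which is valid because a data_update embeds its rbio. A generic sketch of that embed-plus-flag idiom, using stand-in types rather than the bcachefs definitions:

        /* Illustration only: when an inner struct is known (via a flag) to be
         * embedded in a larger one, container_of() recovers the outer struct
         * from a pointer to the inner member. */
        #include <stddef.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct inner {
                unsigned embedded:1;    /* set only when inside struct outer */
        };

        struct outer {
                int payload;
                struct inner in;        /* embedded member, analogous to data_update.rbio */
        };

        static struct outer *inner_to_outer(struct inner *i)
        {
                return i->embedded ? container_of(i, struct outer, in) : NULL;
        }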
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
index edcf50a4..cd219504 100644
--- a/libbcachefs/io_read.h
+++ b/libbcachefs/io_read.h
@@ -36,7 +36,8 @@ struct bch_read_bio {
u16 flags;
union {
struct {
- u16 promote:1,
+ u16 data_update:1,
+ promote:1,
bounce:1,
split:1,
have_ioref:1,
@@ -109,7 +110,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
x(retry_if_stale) \
x(may_promote) \
x(user_mapped) \
- x(data_update) \
x(last_fragment) \
x(must_bounce) \
x(must_clone) \
@@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->c = orig->c;
- rbio->_state = 0;
- rbio->ret = 0;
- rbio->split = true;
- rbio->parent = orig;
- rbio->opts = orig->opts;
+ rbio->c = orig->c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->split = true;
+ rbio->parent = orig;
+ rbio->opts = orig->opts;
return rbio;
}
@@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
- rbio->ret = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index dbfcb28f..a2e6b305 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -34,6 +34,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bounce = true;
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+ if (!bounce && write_corrupt_ratio) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+#endif
saved_iter = dst->bi_iter;
do {
@@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (write_corrupt_ratio) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+#endif
+
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
@@ -1394,6 +1417,7 @@ retry:
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
+
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);
@@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
- prt_str(out, "pos: ");
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "started: ");
+ prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);
- prt_str(out, "flags: ");
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}
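
write_corrupt_ratio mirrors the read-side knob relocated out of io_read.c earlier in this patch: a ratio of N corrupts roughly one bio in N, and 0 disables injection entirely. A sketch of just the probability check, matching the logic of the removed maybe_corrupt_bio(); random_below() is a hypothetical stand-in for the kernel's get_random_u32_below():

        /* Illustration only: ratio-N fault injection fires when a uniform
         * draw from [0, N) lands on 0, i.e. with probability 1/N. */
        #include <stdbool.h>
        #include <stdlib.h>

        static unsigned random_below(unsigned ceil)     /* stand-in for get_random_u32_below() */
        {
                return rand() % ceil;   /* modulo bias is fine for an illustration */
        }

        static bool should_corrupt(unsigned ratio)
        {
                return ratio && !random_below(ratio);
        }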
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 331c9d76..cf2700b0 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf);
}
-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
-{
- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-}
-
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a3096e2a..55e17c2d 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -101,13 +101,25 @@ static void move_free(struct moving_io *io)
static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt;
- if (io->write.op.error)
+ if (op->error) {
+ if (trace_io_move_write_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_write_op_to_text(&buf, op);
+ prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
+ trace_io_move_write_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+
ctxt->write_error = true;
+ }
- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_dec(&io->write.ctxt->write_ios);
+ atomic_sub(io->write_sectors, &ctxt->write_sectors);
+ atomic_dec(&ctxt->write_ios);
move_free(io);
closure_put(&ctxt->cl);
}
@@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt,
bkey_start_pos(k.k),
iter->btree_id, k, 0,
NULL,
- BCH_READ_data_update|
BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1);
return 0;
@@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ s64 offset_into_extent = 0;
bch2_trans_iter_exit(trans, &reflink_iter);
k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
@@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
* pointer - need to fixup iter->k
*/
extent_iter = &reflink_iter;
+ offset_into_extent = 0;
}
if (!bkey_extent_is_direct_data(k.k))
@@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
- unsigned sectors_moved = 0;
struct bkey_buf last_flushed;
int ret = 0;
@@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (ctxt->stats)
atomic64_add(sectors, &ctxt->stats->sectors_seen);
- sectors_moved += sectors;
next:
bch2_btree_iter_advance(&bp_iter);
}
@@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen: ");
+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_printf(out, "bytes moved: ");
+ prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_printf(out, "bytes raced: ");
+ prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
- struct moving_io *io;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
@@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock);
+ struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list)
bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index fa19fc44..5126c870 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
}
static int bch2_copygc_thread(void *arg)
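
The copygc hunk above uses the standard recipe for taking a sleepable reference on a task published via an RCU-protected pointer: dereference under rcu_read_lock(), pin with get_task_struct() before leaving the critical section, and only then call into code that may sleep (bch2_prt_task_backtrace() allocates with GFP_KERNEL). A minimal standalone sketch of the same pattern, with a hypothetical pointer name:

	/* sketch; 'published_task' stands in for any RCU-published task pointer */
	static void backtrace_published_task(struct printbuf *out,
					     struct task_struct __rcu **published_task)
	{
		rcu_read_lock();
		struct task_struct *t = rcu_dereference(*published_task);
		if (t)
			get_task_struct(t);	/* pin before we can sleep */
		rcu_read_unlock();

		if (t) {
			bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
			put_task_struct(t);
		}
	}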
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index afb89d31..baa9c11a 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -186,6 +186,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
+ x(checksum_err_retry_nr, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, 32), \
+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \
+ NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
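
checksum_err_retry_nr is a new u8 option, range 0-32 with a default of 3, persisted in the superblock via BCH_SB_CSUM_ERR_RETRY_NR. Given the OPT_MOUNT|OPT_RUNTIME flags it should be settable as a mount option and, assuming the usual options plumbing, at runtime through /sys/fs/bcachefs/<uuid>/options/checksum_err_retry_nr.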
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 58f6d97e..29a56938 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -26,9 +26,8 @@
/* bch_extent_rebalance: */
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL;
}
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
+}
+
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts,
struct bkey_s_c k,
@@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts)
return 0;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
+ printbuf_tabstop_push(out, 32);
+
struct bch_fs_rebalance *r = &c->rebalance;
+ /* print pending work */
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+
+ prt_printf(out, "pending work:\t");
+ prt_human_readable_u64(out, v);
+ prt_printf(out, "\n\n");
+
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
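
Reading a counter out of the in-memory disk accounting always follows the shape shown above: fill in a struct disk_accounting_pos with the accounting type, convert it to a btree position, and read out the u64s. A hedged sketch (the helper name is hypothetical):

	static u64 read_one_accounting_counter(struct bch_fs *c,
					       enum disk_accounting_type type)
	{
		struct disk_accounting_pos acc = { .type = type, };
		u64 v = 0;

		/* reads a single u64 for this position from the accounting mem cache */
		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
		return v;
	}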
@@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
- prt_str(out, "io wait duration: ");
+ prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
- prt_str(out, "io wait remaining: ");
+ prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
- prt_str(out, "duration waited: ");
+ prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
@@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break;
}
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
+
printbuf_indent_sub(out, 2);
}
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 71c786cd..a6e26733 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -899,7 +899,7 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
- journal_seq += 8;
+ journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
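
Tying the skip to JOURNAL_BUF_NR removes a magic number: the gap has to cover every journal buffer that could still have been in flight. Assuming JOURNAL_BUF_NR is 4 in this tree, the blacklisted range grows from 8 to 16 sequence numbers.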
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index c82a8910..fa27ec59 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -22,6 +22,7 @@ enum counters_flags {
x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \
+ x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
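
The second field in each x() entry is the stable on-disk counter ID, not a list position — which is why io_move_write_fail takes the next free ID (82) while being inserted next to the related io_move counters; list order only affects grouping in output. A sketch of how such an x-macro list is typically consumed (illustrative, not copied from the header):

	#define x(t, n, ...) BCH_COUNTER_##t,
	enum bch_persistent_counters {
		BCH_PERSISTENT_COUNTERS()
		BCH_COUNTER_NR
	};
	#undef x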
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index f645a454..575ad1e0 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -12,7 +12,6 @@
#include "super.h"
#include <linux/crc32c.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
static inline enum bch_str_hash_type
@@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
};
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
- desc->tfm = c->sha256;
-
- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
+ sha256((const u8 *)&bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
}
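
With the digest now coming from a plain sha256() call, the per-filesystem crypto_shash transform (c->sha256) and the on-stack shash descriptor go away; this is also what allows the "pre: sha256" module softdep (along with crc32c/crc64) to be dropped from super.c later in this diff.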
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index ee32d043..f2e44282 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
- enum bch_validate_flags flags, struct printbuf *out)
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+ enum bch_validate_flags flags, struct printbuf *out)
{
- struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
int ret;
@@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (ret)
return ret;
- if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- prt_printf(out, "Filesystem has incompatible features");
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features;
}
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_printf(out, "Filesystem has incompatible version");
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features;
}
@@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid;
}
+ if (!(flags & BCH_VALIDATE_write) &&
+ le64_to_cpu(sb->offset) != read_offset) {
+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
+ le64_to_cpu(sb->offset), read_offset);
+ return -BCH_ERR_invalid_sb_offset;
+ }
+
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)",
@@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+ !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__
@@ -874,7 +896,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb, 0, &err);
+ ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 167dd98f..78f708a6 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index cffad3b6..8e928b3d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -75,9 +75,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: crc64");
-MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
@@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_late;
up_write(&c->state_lock);
- return 0;
+out:
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
+ return ret;
err_unlock:
mutex_unlock(&c->sb_lock);
@@ -1847,10 +1848,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
+ goto out;
err_late:
up_write(&c->state_lock);
ca = NULL;
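
bch2_dev_add() now funnels success and failure through one exit block: the happy path falls through to out:, and the error labels do their extra unwinding and then jump back to out: instead of duplicating the printbuf/bch_err_fn epilogue. In miniature (all names hypothetical):

	int example_add(void)
	{
		struct printbuf errbuf = PRINTBUF;
		int ret = do_risky_setup();
		if (ret)
			goto err;
	out:
		printbuf_exit(&errbuf);	/* shared epilogue, success or failure */
		return ret;
	err:
		undo_partial_setup();	/* error-only teardown */
		goto out;
	}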
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2ed3f755..5b8463ae 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
+write_attribute(read_fua_test);
read_attribute(uuid);
read_attribute(minor);
@@ -395,6 +396,71 @@ SHOW(bch2_fs)
return 0;
}
+static int read_fua_test(struct bch_fs *c)
+{
+ int ret = 0;
+ unsigned bs = 4096;
+ struct bio *bio;
+ void *buf = NULL;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ);
+ if (!ca)
+ return -EINVAL;
+
+ ret = -ENOMEM; /* covers both allocations below */
+ bio = bio_kmalloc(1, GFP_KERNEL);
+ if (!bio)
+ goto err;
+
+ buf = kmalloc(bs, GFP_KERNEL);
+ if (!buf)
+ goto err;
+ ret = 0;
+
+ u64 start = ktime_get_ns();
+ for (unsigned i = 0; i < 1000; i++) {
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
+ bch2_bio_map(bio, buf, bs);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+ }
+ u64 ns_nofua = ktime_get_ns() - start;
+
+ start = ktime_get_ns();
+ for (unsigned i = 0; i < 1000; i++) {
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
+ bch2_bio_map(bio, buf, bs);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+ }
+ u64 ns_fua = ktime_get_ns() - start;
+
+ u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
+
+ start = ktime_get_ns();
+ for (unsigned i = 0; i < 1000; i++) {
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
+ bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
+ bch2_bio_map(bio, buf, bs);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+ }
+ u64 ns_rand = ktime_get_ns() - start;
+
+ pr_info("ns nofua %llu", ns_nofua);
+ pr_info("ns fua %llu", ns_fua);
+ pr_info("ns random %llu", ns_rand);
+err:
+ kfree(buf);
+ kfree(bio);
+ percpu_ref_put(&ca->io_ref);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
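
read_fua_test is a write-only attribute: writing anything to it (e.g. echo 1 > /sys/fs/bcachefs/<uuid>/internal/read_fua_test — path assuming the usual placement of bch2_fs_internal_files) runs 1000 4 KiB reads three ways against device 0 — sequential, sequential with REQ_FUA, and random — and logs the three timings via pr_info. Comparing ns_fua against ns_rand gives a rough signal of whether the device actually honors FUA on reads or just serves them from its cache.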
@@ -451,6 +517,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait);
+ if (attr == &sysfs_read_fua_test)
+ read_fua_test(c);
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates,
+ &sysfs_read_fua_test,
&sysfs_gc_gens_pos,
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index c8669a6b..519d00d6 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail,
TP_ARGS(c, str)
);
+DEFINE_EVENT(fs_str, io_move_write_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 50a90e48..bf555ae7 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
return 0;
}
-size_t bch2_rand_range(size_t max)
-{
- size_t rand;
-
- if (!max)
- return 0;
-
- do {
- rand = get_random_long();
- rand &= roundup_pow_of_two(max) - 1;
- } while (rand >= max);
-
- return rand;
-}
-
void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{
struct bio_vec bv;
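
bch2_rand_range() — a hand-rolled rejection-sampling loop over get_random_long() — is dropped in favor of the kernel's get_random_u32_below()/get_random_u64_below(), which the new read_fua_test and bch2_corrupt_bio() call directly.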
@@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned u64s = bv.bv_len / sizeof(u64);
+
+ if (offset < u64s) {
+ u64 *segment = bvec_kmap_local(&bv);
+ segment[offset] = get_random_u64();
+ kunmap_local(segment);
+ return;
+ }
+ offset -= u64s;
+ }
+}
+#endif
+
#if 0
void eytzinger1_test(void)
{
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index e7c3541b..f0e360eb 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -401,11 +401,21 @@ do { \
_ret; \
})
-size_t bch2_rand_range(size_t);
-
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *);
+
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+{
+ if (ratio && !get_random_u32_below(ratio))
+ bch2_corrupt_bio(bio);
+}
+#else
+#define bch2_maybe_corrupt_bio(...) do {} while (0)
+#endif
+
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{
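
bch2_maybe_corrupt_bio() is the debug-build fault-injection hook: a nonzero ratio corrupts roughly one in every ratio bios (bch2_corrupt_bio() overwrites one randomly chosen u64 somewhere in the bio), and ratio 0 disables it; in non-debug builds the macro compiles away. A hedged usage sketch — the knob and function names are hypothetical, but this mirrors how such a hook is wired into a write path:

	static unsigned example_corrupt_ratio;	/* hypothetical knob; 0 == off */

	static void example_submit_write(struct bio *bio)
	{
		/* debug builds: corrupt ~1 in example_corrupt_ratio writes to
		 * exercise checksum-error and repair paths */
		bch2_maybe_corrupt_bio(bio, example_corrupt_ratio);
		submit_bio(bio);
	}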