summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2025-03-10 14:22:35 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2025-03-16 16:09:01 -0400
commit6cbadc946dbc204f284615521ab29c18c5a0a762 (patch)
tree4e06b204b6cd3a3ea61e4aae0cccfb3665c860e2
parent86cbeaf1c2b76b1b1d9e440d2f021df66d868b2e (diff)
Update bcachefs sources to 46af7258b951 bcachefs: BCH_SB_FEATURES_ALL includes BCH_FEATURE_incompat_verison_field
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/blk_types.h4
-rw-r--r--include/linux/blkdev.h9
-rw-r--r--include/linux/dcache.h2
-rw-r--r--libbcachefs/bcachefs.h5
-rw-r--r--libbcachefs/bcachefs_format.h7
-rw-r--r--libbcachefs/btree_cache.c9
-rw-r--r--libbcachefs/btree_io.c87
-rw-r--r--libbcachefs/btree_key_cache.c2
-rw-r--r--libbcachefs/btree_locking.c5
-rw-r--r--libbcachefs/btree_locking.h2
-rw-r--r--libbcachefs/btree_node_scan.c29
-rw-r--r--libbcachefs/btree_update_interior.c128
-rw-r--r--libbcachefs/btree_update_interior.h5
-rw-r--r--libbcachefs/data_update.c27
-rw-r--r--libbcachefs/dirent_format.h4
-rw-r--r--libbcachefs/ec.c377
-rw-r--r--libbcachefs/ec.h15
-rw-r--r--libbcachefs/ec_types.h7
-rw-r--r--libbcachefs/errcode.h20
-rw-r--r--libbcachefs/error.c34
-rw-r--r--libbcachefs/error.h52
-rw-r--r--libbcachefs/extents.c141
-rw-r--r--libbcachefs/extents.h21
-rw-r--r--libbcachefs/extents_format.h24
-rw-r--r--libbcachefs/extents_types.h12
-rw-r--r--libbcachefs/fs-common.c13
-rw-r--r--libbcachefs/fs-io.c1
-rw-r--r--libbcachefs/fs-ioctl.c7
-rw-r--r--libbcachefs/fs.c11
-rw-r--r--libbcachefs/fsck.c21
-rw-r--r--libbcachefs/io_read.c382
-rw-r--r--libbcachefs/io_read.h21
-rw-r--r--libbcachefs/io_write.c12
-rw-r--r--libbcachefs/io_write.h6
-rw-r--r--libbcachefs/journal.c57
-rw-r--r--libbcachefs/journal_io.c55
-rw-r--r--libbcachefs/journal_types.h1
-rw-r--r--libbcachefs/move.c6
-rw-r--r--libbcachefs/movinggc.c25
-rw-r--r--libbcachefs/opts.h5
-rw-r--r--libbcachefs/recovery_passes_types.h2
-rw-r--r--libbcachefs/reflink.c2
-rw-r--r--libbcachefs/sb-downgrade.c3
-rw-r--r--libbcachefs/sb-errors_format.h3
-rw-r--r--libbcachefs/sb-members.h4
-rw-r--r--libbcachefs/six.c5
-rw-r--r--libbcachefs/six.h7
-rw-r--r--libbcachefs/super-io.c52
-rw-r--r--libbcachefs/super-io.h15
-rw-r--r--libbcachefs/super.c104
-rw-r--r--libbcachefs/super.h2
-rw-r--r--libbcachefs/super_types.h8
-rw-r--r--libbcachefs/sysfs.c5
-rw-r--r--libbcachefs/trace.h24
-rw-r--r--linux/blkdev.c2
56 files changed, 1097 insertions, 794 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9f202c51..7d7555ff 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9736cbbc5cc39f6c666befdd787788b6ce6497f6
+46af7258b951a79a66511172ab8772ad2dfaa4e3
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2384a5e3..e3c3a9b4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -10,6 +10,8 @@
#include <linux/types.h>
#include <linux/bvec.h>
#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
struct bio_set;
struct bio;
@@ -63,6 +65,8 @@ struct block_device {
struct gendisk * bd_disk;
struct gendisk __bd_disk;
int bd_fd;
+
+ struct mutex bd_holder_lock;
};
#define bdev_kobj(_bdev) (&((_bdev)->kobj))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b295bd9a..6964396e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -65,7 +65,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev);
sector_t get_capacity(struct gendisk *disk);
struct blk_holder_ops {
- void (*mark_dead)(struct block_device *bdev);
+ void (*mark_dead)(struct block_device *bdev, bool surprise);
+ void (*sync)(struct block_device *bdev);
+ int (*freeze)(struct block_device *bdev);
+ int (*thaw)(struct block_device *bdev);
};
static inline struct block_device *file_bdev(struct file *file)
@@ -80,8 +83,12 @@ int lookup_bdev(const char *path, dev_t *);
struct super_block {
void *s_fs_info;
+ struct rw_semaphore s_umount;
};
+static inline void evict_inodes(struct super_block *sb) {}
+static inline int sync_filesystem(struct super_block *) { return 0; }
+
/*
* File types
*
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f395ce7f..b9d0ea22 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -9,6 +9,8 @@ struct dentry {
struct inode *d_inode;
};
+static inline void shrink_dcache_sb(struct super_block *) {}
+
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index d2c3f59a..b432bb6e 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -536,6 +536,7 @@ struct bch_dev {
*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
+ unsigned long write_errors_start;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@@ -1002,15 +1003,11 @@ struct bch_fs {
wait_queue_head_t copygc_running_wq;
/* STRIPES: */
- GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
spinlock_t ec_stripes_new_lock;
- ec_stripes_heap ec_stripes_heap;
- struct mutex ec_stripes_heap_lock;
-
/* ERASURE CODING */
struct list_head ec_stripe_head_list;
struct mutex ec_stripe_head_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 13cc0833..7a5b0d21 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -690,7 +690,8 @@ struct bch_sb_field_ext {
x(cached_backpointers, BCH_VERSION(1, 21)) \
x(stripe_backpointers, BCH_VERSION(1, 22)) \
x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24))
+ x(casefolding, BCH_VERSION(1, 24)) \
+ x(extent_flags, BCH_VERSION(1, 25))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -859,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -927,7 +929,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
BIT_ULL(BCH_FEATURE_new_siphash)| \
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush))
+ BIT_ULL(BCH_FEATURE_journal_no_flush)| \
+ BIT_ULL(BCH_FEATURE_incompat_version_field))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index ca755e8d..1ec1f90e 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return NULL;
}
- bch2_btree_lock_init(&b->c, 0);
+ bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
__bch2_btree_node_to_freelist(bc, b);
return b;
@@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
}
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (!b) {
+ if (b) {
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
+ } else {
mutex_unlock(&bc->lock);
bch2_trans_unlock(trans);
b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
mutex_lock(&bc->lock);
}
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 80a0094b..6638bb1f 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1187,7 +1187,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
- b->written += sectors;
+ b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
@@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
bch_info(c, "retrying read");
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
rb->have_ioref = ca != NULL;
+ rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1339,17 +1340,22 @@ static void btree_node_read_work(struct work_struct *work)
} else {
bio->bi_status = BLK_STS_REMOVED;
}
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
+
+ if (ca && bio->bi_status)
+ bch_err_dev_ratelimited(ca,
+ "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
- bch2_mark_io_failure(&failed, &rb->pick);
+ bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
@@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
queue_work(c->btree_read_complete_wq, &rb->work);
}
@@ -2075,6 +2080,11 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ unsigned commit_flags =
+ BCH_WATERMARK_interior_updates|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw;
u64 start_time = wbio->start_time;
int ret = 0;
@@ -2083,38 +2093,24 @@ static void btree_node_write_work(struct work_struct *work)
wbio->wbio.used_mempool,
wbio->data);
- bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
- bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_node_write_all_failed;
- goto err;
- }
-
- if (wbio->wbio.first_btree_write) {
- if (wbio->wbio.failed.nr) {
-
- }
- } else {
+ if (wbio->wbio.failed.nr) {
+ ret = bch2_trans_do(c,
+ bch2_btree_node_rewrite_key_get_iter(trans, b,
+ commit_flags));
+ } else if (!wbio->wbio.first_btree_write) {
ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw,
- !wbio->wbio.failed.nr));
- if (ret)
- goto err;
+ commit_flags, true));
}
-out:
+
+ if (ret) {
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
+ }
+
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b, start_time);
- return;
-err:
- set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
- goto out;
}
static void btree_node_write_endio(struct bio *bio)
@@ -2126,16 +2122,17 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
- unsigned long flags;
- if (wbio->have_ioref)
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (ca && bio->bi_status)
+ bch_err_dev_ratelimited(ca,
+ "btree write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
- if (!ca ||
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("btree")) {
+ if (bio->bi_status) {
+ unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 1821f40c..edce5943 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
}
if (ck) {
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
ck->c.cached = true;
goto lock;
}
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 10b805a6..caef65ad 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -7,9 +7,10 @@
static struct lock_class_key bch2_btree_node_lock_key;
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags)
+ enum six_lock_init_flags flags,
+ gfp_t gfp)
{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
lockdep_set_notrack_class(&b->lock);
}
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index b54ef48e..b33ab7af 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -13,7 +13,7 @@
#include "btree_iter.h"
#include "six.h"
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
void bch2_trans_unlock_noassert(struct btree_trans *);
void bch2_trans_unlock_write(struct btree_trans *);
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
index a7f06dee..67816132 100644
--- a/libbcachefs/btree_node_scan.c
+++ b/libbcachefs/btree_node_scan.c
@@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, bn, PAGE_SIZE);
+ u64 submit_time = local_clock();
submit_bio_wait(bio);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status)))
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status));
return;
+ }
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
@@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
closure_put(w->cl);
kfree(w);
return 0;
@@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
percpu_ref_put(&ca->io_ref);
ret = -ENOMEM;
goto err;
}
- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ percpu_ref_get(&ca->io_ref);
+ wake_up_process(t);
}
err:
closure_sync(&cl);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index aac2947a..d3e0cf01 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -2126,6 +2126,31 @@ err_free_update:
goto out;
}
+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b)
+{
+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ /* has node been freed? */
+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ ret = -BCH_ERR_btree_node_dying;
+ goto err;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
@@ -2191,7 +2216,29 @@ err:
goto out;
}
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_i *k, unsigned flags)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter,
+ btree, k->k.p,
+ BTREE_MAX_DEPTH, level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
+
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
+ ret = found
+ ? bch2_btree_node_rewrite(trans, &iter, b, flags)
+ : -ENOENT;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bpos pos, unsigned flags)
{
@@ -2211,6 +2258,19 @@ err:
return ret;
}
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+ struct btree *b, unsigned flags)
+{
+ struct btree_iter iter;
+ int ret = get_iter_to_node(trans, &iter, b);
+ if (ret)
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
struct async_btree_rewrite {
struct bch_fs *c;
struct work_struct work;
@@ -2220,57 +2280,14 @@ struct async_btree_rewrite {
struct bkey_buf key;
};
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
- struct async_btree_rewrite *a)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter,
- a->btree_id, a->key.k->k.p,
- BTREE_MAX_DEPTH, a->level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto out;
-
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
- ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0)
- : -ENOENT;
-
-#if 0
- /* Tracepoint... */
- if (!ret || ret == -ENOENT) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- if (!ret) {
- prt_printf(&buf, "rewrite node:\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- } else {
- prt_printf(&buf, "node to rewrite not found:\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- prt_printf(&buf, "\n got: ");
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null)");
- }
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-#endif
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+ a->btree_id, a->level, a->key.k, 0));
if (ret != -ENOENT)
bch_err_fn_ratelimited(c, ret);
@@ -2514,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
unsigned commit_flags, bool skip_triggers)
{
struct btree_iter iter;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
+ int ret = get_iter_to_node(trans, &iter, b);
if (ret)
- goto out;
-
- /* has node been freed? */
- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
-
- BUG_ON(!btree_node_hashed(b));
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
-out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index b5be250b..be71cd73 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -169,9 +169,12 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
enum btree_id, unsigned,
struct bpos, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+ struct btree *, unsigned);
+
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 7e484afe..522574bc 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
- prt_newline(out);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
@@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0;
}
+static bool can_write_extent(struct bch_fs *c,
+ struct bch_devs_list *devs_have,
+ unsigned target)
+{
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+ darray_for_each(*devs_have, i)
+ __clear_bit(*i, devs.d);
+
+ return !bch2_is_zero(&devs, sizeof(devs));
+}
+
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
@@ -788,6 +799,20 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
+ if (!can_write_extent(c, &m->op.devs_have,
+ m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
+ /*
+ * Check if we have rw devices not in devs_have: this can happen
+ * if we're trying to move data on a ro or failed device
+ *
+ * If we can't move it, we need to clear the rebalance_work bit,
+ * if applicable
+ *
+ * Also, copygc should skip ro/failed devices:
+ */
+ return -BCH_ERR_data_update_done_no_rw_devs;
+ }
+
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
diff --git a/libbcachefs/dirent_format.h b/libbcachefs/dirent_format.h
index 2e766032..a46dbddd 100644
--- a/libbcachefs/dirent_format.h
+++ b/libbcachefs/dirent_format.h
@@ -44,9 +44,9 @@ struct bch_dirent {
__u8 d_pad;
__le16 d_name_len;
__le16 d_cf_name_len;
- __u8 d_names[0];
+ __u8 d_names[];
} d_cf_name_block __packed;
- __u8 d_name[0];
+ __DECLARE_FLEX_ARRAY(__u8, d_name);
} __packed;
} __packed __aligned(8);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1090cdb7..865cc53a 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -105,6 +105,7 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ u64 submit_time;
struct bio bio;
};
@@ -494,38 +495,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- if (flags & BTREE_TRIGGER_atomic) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- stripe_to_mem(m, new_s);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- }
-
return 0;
}
@@ -748,14 +717,15 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "erasure coding %s error: %s",
+ bch2_account_io_completion(ca, bio_data_dir(bio),
+ ec_bio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
int stale = dev_ptr_stale(ca, ptr);
if (stale) {
@@ -818,6 +788,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
@@ -939,26 +910,6 @@ err:
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
- ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
- if (idx >= h->size) {
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- if (n.size > h->size) {
- memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
- n.nr = h->nr;
- swap(*h, n);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- free_heap(&n);
- }
-
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1031,155 +982,26 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
s->idx = 0;
}
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
-
- lockdep_assert_held(&c->ec_stripes_heap_lock);
-
- if (h->nr &&
- h->data[0].blocks_nonempty == 0 &&
- !bch2_stripe_is_open(c, h->data[0].idx))
- return h->data[0].idx;
-
- return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
- size_t i)
-{
- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
- return ((_l->blocks_nonempty > _r->blocks_nonempty) <
- (_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
- ec_stripes_heap *_h = (ec_stripes_heap *)h;
- size_t i = _l - _h->data;
- size_t j = _r - _h->data;
-
- swap(*_l, *_r);
-
- ec_stripes_heap_set_backpointer(_h, i);
- ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- BUG_ON(m->heap_idx >= h->nr);
- BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
- .idx = idx,
- .blocks_nonempty = m->blocks_nonempty,
- }),
- &callbacks,
- &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- bool do_deletes;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
- i = m->heap_idx;
- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
-
- do_deletes = stripe_idx_to_delete(c) != 0;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (do_deletes)
- bch2_do_stripe_deletes(c);
-}
-
/* stripe deletion */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_stripe s;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
- ret = -EINVAL;
- goto err;
- }
-
- s = bkey_s_c_to_stripe(k);
- for (unsigned i = 0; i < s.v->nr_blocks; i++)
- if (stripe_blockcount_get(s.v, i)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ /*
+ * We expect write buffer races here
+ * Important: check stripe_is_open with stripe key locked:
+ */
+ if (k.k->type == KEY_TYPE_stripe &&
+ !bch2_stripe_is_open(trans->c, idx) &&
+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1194,21 +1016,16 @@ static void ec_stripe_delete_work(struct work_struct *work)
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- while (1) {
- mutex_lock(&c->ec_stripes_heap_lock);
- u64 idx = stripe_idx_to_delete(c);
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (!idx)
- break;
-
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_delete(trans, idx));
- bch_err_fn(c, ret);
- if (ret)
- break;
- }
-
+ bch2_trans_run(c,
+ bch2_btree_write_buffer_tryflush(trans) ?:
+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+ 0, lru_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ec_stripe_delete(trans, lru_k.k->p.offset);
+ })));
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@@ -1557,6 +1374,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ret)
goto err;
err:
+ trace_stripe_create(c, s->idx, ret);
+
bch2_disk_reservation_put(c, &s->res);
for (i = 0; i < v->nr_blocks; i++)
@@ -1998,39 +1817,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
return 0;
}
-static s64 get_existing_stripe(struct bch_fs *c,
- struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+ struct ec_stripe_head *head,
+ struct ec_stripe_buf *stripe,
+ u64 idx)
{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t heap_idx;
- u64 stripe_idx;
- s64 ret = -1;
-
- if (may_create_new_stripe(c))
- return -1;
+ struct bch_fs *c = trans->c;
- mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
- /* No blocks worth reusing, stripe will just be deleted: */
- if (!h->data[heap_idx].blocks_nonempty)
- continue;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- stripe_idx = h->data[heap_idx].idx;
+ /* We expect write buffer races here */
+ if (k.k->type != KEY_TYPE_stripe)
+ goto out;
- m = genradix_ptr(&c->stripes, stripe_idx);
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ if (stripe_lru_pos(s.v) <= 1)
+ goto out;
- if (m->disk_label == head->disk_label &&
- m->algorithm == head->algo &&
- m->nr_redundant == head->redundancy &&
- m->sectors == head->blocksize &&
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
- bch2_try_open_stripe(c, head->s, stripe_idx)) {
- ret = stripe_idx;
- break;
- }
+ if (s.v->disk_label == head->disk_label &&
+ s.v->algorithm == head->algo &&
+ s.v->nr_redundant == head->redundancy &&
+ le16_to_cpu(s.v->sectors) == head->blocksize &&
+ bch2_try_open_stripe(c, head->s, idx)) {
+ bkey_reassemble(&stripe->key, k);
+ ret = 1;
}
- mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+ bch2_set_btree_iter_dontneed(&iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2082,24 +1902,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
- s64 idx;
- int ret;
/*
* If we can't allocate a new stripe, and there's no stripes with empty
* blocks for us to reuse, that means we have to wait on copygc:
*/
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
+ if (may_create_new_stripe(c))
+ return -1;
- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+ 0, lru_k, ret) {
+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+ if (ret)
+ break;
}
+ bch2_trans_iter_exit(trans, &lru_iter);
+ if (!ret)
+ ret = -BCH_ERR_stripe_alloc_blocked;
+ if (ret == 1)
+ ret = 0;
+ if (ret)
+ return ret;
return init_new_stripe_from_existing(c, s);
}
@@ -2397,46 +2226,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
- stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- 0;
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
- m = genradix_ptr(&c->stripes, h->data[i].idx);
-
- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
- h->data[i].blocks_nonempty,
- m->nr_blocks - m->nr_redundant,
- m->nr_redundant);
- if (bch2_stripe_is_open(c, h->data[i].idx))
- prt_str(out, " open");
- prt_newline(out);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
+ return 0;
}
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2507,15 +2297,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
BUG_ON(!list_empty(&c->ec_stripe_new_list));
- free_heap(&c->ec_stripes_heap);
- genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
void bch2_fs_ec_init_early(struct bch_fs *c)
{
spin_lock_init(&c->ec_stripes_new_lock);
- mutex_init(&c->ec_stripes_heap_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index cd1c837e..8f2228e5 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s)
if (!s)
return 0;
- unsigned blocks_empty = 0, blocks_nonempty = 0;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
- for (unsigned i = 0; i < s->nr_blocks; i++) {
- blocks_empty += !stripe_blockcount_get(s, i);
- blocks_nonempty += !!stripe_blockcount_get(s, i);
- }
+ for (unsigned i = 0; i < nr_data; i++)
+ blocks_empty += !stripe_blockcount_get(s, i);
/* Will be picked up by the stripe_delete worker */
- if (!blocks_nonempty)
+ if (blocks_empty == nr_data)
return STRIPE_LRU_POS_EMPTY;
if (!blocks_empty)
@@ -260,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
unsigned, unsigned, unsigned,
enum bch_watermark, struct closure *);
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@@ -300,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 37558cc2..06144bfd 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -31,11 +31,4 @@ struct gc_stripe {
struct bch_replicas_padded r;
};
-struct ec_stripe_heap_entry {
- size_t idx;
- unsigned blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 89df9781..531fe575 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -119,6 +119,7 @@
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
+ x(ENOENT, btree_node_dying) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@@ -185,6 +186,7 @@
x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
+ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
x(EINVAL, mismatched_block_size) \
@@ -205,6 +207,7 @@
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
x(EINVAL, varint_decode_error) \
+ x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -269,12 +272,29 @@
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
+ x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
x(EIO, device_offline) \
+ x(EIO, EIO_fault_injected) \
+ x(EIO, data_read) \
+ x(BCH_ERR_data_read, data_read_retry) \
+ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
+ x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
+ x(BCH_ERR_data_read, data_read_decompress_err) \
+ x(BCH_ERR_data_read, data_read_decrypt_err) \
+ x(BCH_ERR_data_read, data_read_ptr_stale_race) \
+ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
+ x(BCH_ERR_data_read, data_read_no_encryption_key) \
+ x(BCH_ERR_data_read, data_read_buffer_too_small) \
+ x(BCH_ERR_data_read, data_read_key_overwritten) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 3f93a5a6..6d68c89a 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
- bool dev;
+
+ /* XXX: if it's reads or checksums that are failing, set it to failed */
down_write(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED)
- : bch2_fs_emergency_read_only(c))
+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+ if (write_errors_start &&
+ time_after(jiffies,
+ write_errors_start + c->opts.write_error_timeout * HZ)) {
+ if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+ goto out;
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+
bch_err(ca,
- "too many IO errors, setting %s RO",
+ "writes erroring for %u seconds, setting %s ro",
+ c->opts.write_error_timeout,
dev ? "device" : "filesystem");
+ if (!dev)
+ bch2_fs_emergency_read_only(c);
+
+ }
+out:
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
- //queue_work(system_long_wq, &ca->io_error_work);
+
+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+ ca->write_errors_start = jiffies;
+
+ queue_work(system_long_wq, &ca->io_error_work);
}
enum ask_yn {
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index b3cc69f2..7d3f0e2a 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -216,27 +216,37 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_dev_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
-
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+static inline void bch2_account_io_success_fail(struct bch_dev *ca, /* Per-device accounting for one finished IO of error class @type. */
+ enum bch_member_error_type type,
+ bool success)
+{
+ if (likely(success)) {
+ if (type == BCH_MEMBER_ERROR_write && /* a successful write resets the write-error start marker */
+ ca->write_errors_start) /* only store when it is currently non-zero */
+ ca->write_errors_start = 0;
+ } else {
+ bch2_io_error(ca, type); /* failure: delegate to bch2_io_error() */
+ }
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca, /* IO completion hook: latency accounting followed by success/failure accounting. */
+ enum bch_member_error_type type,
+ u64 submit_time, bool success)
+{
+ if (unlikely(!ca)) /* NULL device: nothing to account against */
+ return;
+
+ if (type != BCH_MEMBER_ERROR_checksum) /* checksum completions are excluded from latency accounting */
+ bch2_latency_acct(ca, submit_time, type); /* @type is passed as the rw argument */
+
+ bch2_account_io_success_fail(ca, type, success);
+}
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 78a51d96..f62ee96b 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -28,6 +28,13 @@
#include "trace.h"
#include "util.h"
+static const char * const bch2_extent_flags_strs[] = { /* Flag names indexed by BCH_EXTENT_FLAG_* value, generated from BCH_EXTENT_FLAGS(). */
+#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
+ BCH_EXTENT_FLAGS()
+#undef x
+ NULL, /* terminating entry */
+};
+
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p)
+ struct extent_ptr_decoded *p,
+ bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@@ -59,25 +67,28 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
- f->dev = p->ptr.dev;
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else if (p->idx != f->idx) {
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else {
- f->nr_failed++;
+ memset(f, 0, sizeof(*f));
+ f->dev = p->ptr.dev;
}
+
+ if (p->do_ec_reconstruct)
+ f->failed_ec = true;
+ else if (!csum_error)
+ f->failed_io = true;
+ else
+ f->failed_csum_nr++;
}
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+static inline u64 dev_latency(struct bch_dev *ca)
{
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
+static inline int dev_failed(struct bch_dev *ca) /* Returns 1 when @ca is NULL or its member state is BCH_MEMBER_STATE_failed, otherwise 0. */
+{
+ return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
/*
* returns true if p1 is better than p2:
*/
@@ -85,9 +96,18 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
{
- if (likely(!p1.idx && !p2.idx)) {
- u64 l1 = dev_latency(c, p1.ptr.dev);
- u64 l2 = dev_latency(c, p2.ptr.dev);
+ if (likely(!p1.do_ec_reconstruct &&
+ !p2.do_ec_reconstruct)) {
+ struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
+
+ int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+
+ if (failed_delta)
+ return failed_delta < 0;
+
+ u64 l1 = dev_latency(ca1);
+ u64 l2 = dev_latency(ca2);
/*
* Square the latencies, to bias more in favor of the faster
@@ -103,9 +123,9 @@ static inline bool ptr_better(struct bch_fs *c,
}
if (bch2_force_reconstruct_read)
- return p1.idx > p2.idx;
+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
- return p1.idx < p2.idx;
+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
}
/*
@@ -114,19 +134,24 @@ static inline bool ptr_better(struct bch_fs *c,
* other devices, it will still pick a pointer from avoid.
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick,
- int dev)
+ struct bch_io_failures *failed,
+ struct extent_ptr_decoded *pick,
+ int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
+ unsigned csum_retry = 0;
+ bool have_csum_retries = false;
int ret = 0;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) /* BCH_EXTENT_FLAG_* are bit indices (poisoned == 0): mask with BIT_ULL(), a bare "& 0" never matches */
+ return -BCH_ERR_extent_poisened;
+again:
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
@@ -154,20 +179,28 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (f)
- p.idx = f->nr_failed < f->nr_retries
- ? f->idx
- : f->idx + 1;
-
- if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
- p.idx++;
+ if (unlikely(failed) &&
+ (f = bch2_dev_io_failures(failed, p.ptr.dev))) {
+ have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
+
+ if (p.has_ec &&
+ !f->failed_ec &&
+ (f->failed_io || f->failed_csum_nr))
+ p.do_ec_reconstruct = true;
+ else if (f->failed_io ||
+ f->failed_csum_nr > csum_retry)
+ continue;
+ }
- if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
- p.idx++;
+ if (!ca || !bch2_dev_is_online(ca)) {
+ if (p.has_ec)
+ p.do_ec_reconstruct = true;
+ else
+ continue;
+ }
- if (p.idx > (unsigned) p.has_ec)
- continue;
+ if (p.has_ec && bch2_force_reconstruct_read)
+ p.do_ec_reconstruct = true;
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
@@ -177,6 +210,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
}
rcu_read_unlock();
+ if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
+ have_csum_retries &&
+ csum_retry < BCH_MAX_CSUM_RETRIES)) {
+ csum_retry++;
+ goto again;
+ }
+
return ret;
}
@@ -1002,7 +1042,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+ return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1225,6 +1265,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
+ case BCH_EXTENT_ENTRY_flags:
+ prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+ break;
+
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1386,6 +1430,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
#endif
break;
}
+ case BCH_EXTENT_ENTRY_flags:
+ bkey_fsck_err_on(entry != ptrs.start,
+ c, extent_flags_not_at_start,
+ "extent flags entry not at start");
+ break;
}
}
@@ -1452,6 +1501,28 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) /* Store @flags in @k's extent-flags entry, inserting one if absent; returns 0, or the error from the incompat-feature request. */
+{
+ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+ if (ret) /* feature request refused: leave @k untouched */
+ return ret;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ if (ptrs.start != ptrs.end && /* only the first entry is examined */
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+ ptrs.start->flags.flags = flags; /* flags entry already present: overwrite in place */
+ } else {
+ struct bch_extent_flags f = {
+ .type = BIT(BCH_EXTENT_ENTRY_flags),
+ .flags = flags,
+ };
+ __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); /* new entry goes in at ptrs.start */
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1497,8 +1568,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_flags:
break;
}
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 8fae6b23..b4058502 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -320,8 +320,8 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
- (_ptr).idx = 0; \
(_ptr).has_ec = false; \
+ (_ptr).do_ec_reconstruct = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@@ -401,7 +401,7 @@ out: \
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *);
+ struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);
@@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
ptr1.unwritten == ptr2.unwritten &&
ptr1.offset == ptr2.offset &&
ptr1.dev == ptr2.dev &&
- ptr1.dev == ptr2.dev);
+ ptr1.gen == ptr2.gen);
}
void bch2_ptr_swab(struct bkey_s);
@@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) /* Returns the flags word when the first entry is a flags entry, else 0. */
+{
+ if (ptrs.start != ptrs.end && /* non-empty entry list */
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+ return ptrs.start->flags.flags;
+ return 0; /* no flags entry at the start */
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) /* Same lookup as bch2_bkey_extent_ptrs_flags(), starting from a key. */
+{
+ return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/extents_format.h b/libbcachefs/extents_format.h
index c198dfc3..74c0252c 100644
--- a/libbcachefs/extents_format.h
+++ b/libbcachefs/extents_format.h
@@ -79,8 +79,9 @@
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
+ x(rebalance, 5) \
+ x(flags, 6)
+#define BCH_EXTENT_ENTRY_MAX 7
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};
+#define BCH_EXTENT_FLAGS() \
+ x(poisoned, 0)
+
+enum bch_extent_flags_e { /* One BCH_EXTENT_FLAG_<name> constant per BCH_EXTENT_FLAGS() row, valued by the row's second column. */
+#define x(n, v) BCH_EXTENT_FLAG_##n = v,
+ BCH_EXTENT_FLAGS()
+#undef x
+};
+
+struct bch_extent_flags { /* One 64-bit extent entry: 7-bit type field plus 57 bits of flags. */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:7,
+ flags:57;
+#elif defined (__BIG_ENDIAN_BITFIELD) /* same two fields, declared in the opposite order */
+ __u64 flags:57,
+ type:7;
+#endif
+};
+
/* bch_extent_rebalance: */
#include "rebalance_format.h"
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index 43d6c341..f8b8e598 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -20,21 +20,23 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
- unsigned idx;
bool has_ec;
+ unsigned do_ec_reconstruct;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
};
+#define BCH_MAX_CSUM_RETRIES 3
+
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
- u8 idx;
- u8 nr_failed;
- u8 nr_retries;
- } devs[BCH_REPLICAS_MAX];
+ unsigned failed_csum_nr:4,
+ failed_io:1,
+ failed_ec:1;
+ } devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index ca70a3de..fbc3da59 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -268,16 +268,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
- struct bkey_s_c dirent_k =
- bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- &dir_hash, dir, name, BTREE_ITER_intent);
- ret = bkey_err(dirent_k);
- if (ret)
- goto err;
-
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum);
- if (ret > 0)
- ret = -ENOENT;
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_intent);
if (ret)
goto err;
@@ -334,7 +326,6 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
- dir_u->bi_size -= bkey_bytes(dirent_k.k);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 94bf34b9..717e7b94 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
ret = bch2_truncate_folio(inode, iattr->ia_size);
if (unlikely(ret < 0))
goto err;
+ ret = 0;
truncate_setsize(&inode->v, iattr->ia_size);
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 4465a2a8..5b47b94f 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
if (ret < 0)
return ret;
- if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding))
- return -EOPNOTSUPP;
+ ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
+ if (ret)
+ return ret;
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
#else
@@ -243,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
int ret = 0;
subvol_inum inum;
- kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
if (!kname)
return -ENOMEM;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 2c011a46..459ca825 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
bch2_opts_apply(&c->opts, opts);
- ret = bch2_fs_start(c);
- if (ret)
- goto err_stop_fs;
+ /*
+ * need to initialise sb and set c->vfs_sb _before_ starting fs,
+ * for blk_holder_ops
+ */
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
@@ -2282,6 +2283,10 @@ got_sb:
sb->s_shrink->seeks = 0;
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_put_super;
+
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9bf316e7..0e85131d 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1978,31 +1978,10 @@ fsck_err:
return ret;
}
-static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- darray_for_each(w->inodes, i)
- if (fsck_err_on(i->inode.bi_size != i->i_size,
- trans, inode_dir_wrong_nlink,
- "directory %llu:%u with wrong i_size: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
- i->inode.bi_size = i->i_size;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
- check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 821ff222..652dbc58 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -329,10 +329,17 @@ nopromote:
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
- return lockrestart_do(trans,
+ int ret = lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { rbio->subvol, read_pos.inode },
read_pos.offset << 9));
+ if (ret)
+ return ret;
+
+ if (rbio->flags & BCH_READ_data_update)
+ prt_str(out, "(internal move) ");
+
+ return 0;
}
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +348,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}
-#define READ_RETRY_AVOID 1
-#define READ_RETRY 2
-#define READ_ERR 3
-
enum rbio_context {
RBIO_CONTEXT_NULL,
RBIO_CONTEXT_HIGHPRI,
@@ -375,6 +378,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);
+ if (rbio->have_ioref) {
+ struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
+ percpu_ref_put(&ca->io_ref);
+ }
+
if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;
@@ -408,13 +416,90 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
-static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
+static struct bkey_s_c get_rbio_extent(struct btree_trans *trans, /* Look up the key that @rbio was reading; the result may be an error key. */
+ struct bch_read_bio *rbio,
+ struct btree_iter *iter)
+{
+ if (rbio->flags & BCH_READ_data_update) {
+ struct data_update *u = container_of(rbio, struct data_update, rbio); /* rbio is embedded in a struct data_update */
+
+ return bch2_bkey_get_iter(trans, iter, /* data update: use the btree and start position recorded in u */
+ u->btree_id, bkey_start_pos(&u->k.k->k), 0);
+ } else {
+ struct bpos pos = rbio->read_pos;
+ int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot); /* fill in pos.snapshot from the subvolume */
+ if (ret) /* @iter is not touched on this path */
+ return bkey_s_c_err(ret);
+
+ return bch2_bkey_get_iter(trans, iter, /* normal read: extents btree at read_pos */
+ BTREE_ID_extents, pos, 0);
+ }
+}
+
+static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans, /* Record rbio->pick in @failed only if the key currently in the btree still has that pointer. */
+ struct bch_read_bio *rbio,
+ struct bch_io_failures *failed)
+{
+ struct btree_iter iter = {}; /* zero-initialised so the exit below also runs when the lookup failed */
+ struct bkey_s_c k;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
+
+ if (!ret) { /* lookup errors are ignored: nothing gets marked */
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) /* pointer we read from is still in the key */
+ bch2_mark_io_failure(failed, &rbio->pick,
+ rbio->ret == -BCH_ERR_data_read_csum_err); /* csum_error argument */
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+}
+
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter, /* Set the poisoned flag on @k once every pointer on a present, non-failed device has BCH_MAX_CSUM_RETRIES checksum failures in @failed. */
+ struct bkey_s_c k, struct bch_io_failures *failed)
+{
+ u64 flags = bch2_bkey_extent_flags(k);
+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) /* already poisoned: nothing to do */
+ return 0;
+
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ /*
+ * Make sure we actually attempt to read and got checksum failures from
+ * every replica
+ */
+
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) /* missing or failed devices do not block poisoning */
+ continue;
+
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
+ if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) { /* no failure record, or checksum retries not used up: do not poison */
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); /* size argument: key bytes plus one flags entry */
+ return PTR_ERR_OR_ZERO(new) ?: /* first non-zero result in the chain is returned */
+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+}
+
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
{
struct data_update *u = container_of(rbio, struct data_update, rbio);
- struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
@@ -429,7 +514,7 @@ retry:
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
- rbio->hole = true;
+ rbio->ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}
@@ -441,14 +526,19 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);
- if (ret == READ_RETRY)
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
- if (ret)
- rbio->bio.bi_status = BLK_STS_IOERR;
+
+ if (ret) {
+ if (ret == -BCH_ERR_no_device_to_read_from && failed)
+ maybe_poison_extent(trans, &iter, k, failed);
+
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
+ }
BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
- bch2_rbio_done(rbio);
- bch2_trans_put(trans);
+ return ret;
}
static void bch2_rbio_retry(struct work_struct *work)
@@ -463,16 +553,22 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
+ struct btree_trans *trans = bch2_trans_get(c);
trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));
- if (rbio->retry == READ_RETRY_AVOID)
- bch2_mark_io_failure(&failed, &rbio->pick);
+ if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+ mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
- if (!rbio->split)
- rbio->bio.bi_status = 0;
+ if (!rbio->split) {
+ rbio->bio.bi_status = 0;
+ rbio->ret = 0;
+ }
+
+ unsigned subvol = rbio->subvol;
+ struct bpos read_pos = rbio->read_pos;
rbio = bch2_rbio_free(rbio);
@@ -481,29 +577,55 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;
- if (flags & BCH_READ_data_update)
- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
- else
- __bch2_read(c, rbio, iter, inum, &failed, flags);
+ int ret = flags & BCH_READ_data_update
+ ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
+ : __bch2_read(trans, rbio, iter, inum, &failed, flags);
+
+ if (ret) {
+ rbio->ret = ret;
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ } else {
+ struct printbuf buf = PRINTBUF;
+
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf,
+ (subvol_inum) { subvol, read_pos.inode },
+ read_pos.offset << 9));
+ if (rbio->flags & BCH_READ_data_update)
+ prt_str(&buf, "(internal move) ");
+ prt_str(&buf, "successful retry");
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ bch2_rbio_done(rbio);
+ bch2_trans_put(trans);
}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
- blk_status_t error)
+static void bch2_rbio_error(struct bch_read_bio *rbio,
+ int ret, blk_status_t blk_error)
{
- rbio->retry = retry;
- rbio->saw_error = true;
+ BUG_ON(ret >= 0);
+
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
+
+ bch2_rbio_parent(rbio)->saw_error = true;
if (rbio->flags & BCH_READ_in_retry)
return;
- if (retry == READ_ERR) {
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ } else {
rbio = bch2_rbio_free(rbio);
- rbio->bio.bi_status = error;
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
+
bch2_rbio_done(rbio);
- } else {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
}
}
@@ -519,15 +641,13 @@ static void bch2_read_io_err(struct work_struct *work)
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
- } else {
+ else
bch_err_ratelimited(c, "%s", buf.buf);
- }
printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -609,14 +729,12 @@ static void bch2_read_csum_err(struct work_struct *work)
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
- } else {
+ else
bch_err_ratelimited(c, "%s", buf.buf);
- }
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -636,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -656,16 +774,53 @@ static void bch2_read_decrypt_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+
+static void corrupt_bio(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned u64s = bv.bv_len / sizeof(u64);
+
+ if (offset < u64s) {
+ u64 *segment = bvec_kmap_local(&bv);
+ segment[offset] = get_random_u64();
+ kunmap_local(segment);
+ return;
+ }
+ offset -= u64s;
+ }
+}
+
+static inline void maybe_corrupt_bio(struct bio *bio)
+{
+ if (bch2_read_corrupt_ratio &&
+ !get_random_u32_below(bch2_read_corrupt_ratio))
+ corrupt_bio(bio);
+}
+#else
+static inline void maybe_corrupt_bio(struct bio *bio)
+{
+}
+#endif
+
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -686,8 +841,26 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
+ maybe_corrupt_bio(src);
+
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
+
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
+ rbio->flags |= BCH_READ_must_bounce;
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
+ BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good)
goto csum_err;
/*
@@ -760,17 +933,6 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
- rbio->flags |= BCH_READ_must_bounce;
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- goto out;
- }
-
bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
@@ -790,10 +952,8 @@ static void bch2_read_endio(struct bio *bio)
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- if (rbio->have_ioref) {
- bch2_latency_acct(ca, rbio->submit_time, READ);
- percpu_ref_put(&ca->io_ref);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rbio->submit_time, !bio->bi_status);
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
@@ -808,9 +968,9 @@ static void bch2_read_endio(struct bio *bio)
trace_and_count(c, io_read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_retry_if_stale)
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
return;
}
@@ -883,7 +1043,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
- int pick_ret;
+ int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -899,16 +1059,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto out_read_done;
}
retry_pick:
- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
+ ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
/* hole or reservation - just zero fill: */
- if (!pick_ret)
+ if (!ret)
goto hole;
- if (unlikely(pick_ret < 0)) {
+ if (unlikely(ret < 0)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
@@ -924,6 +1084,7 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
+ ret = -BCH_ERR_data_read_no_encryption_key;
goto err;
}
@@ -940,7 +1101,7 @@ retry_pick:
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick);
+ bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -984,10 +1145,10 @@ retry_pick:
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
- BUG();
if (ca)
percpu_ref_put(&ca->io_ref);
- goto hole;
+ rbio->ret = -BCH_ERR_data_read_buffer_too_small;
+ goto out_read_done;
}
iter.bi_size = pick.crc.compressed_size << 9;
@@ -1067,8 +1228,7 @@ retry_pick:
rbio->flags = flags;
rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
- rbio->hole = 0;
- rbio->retry = 0;
+ rbio->ret = 0;
rbio->context = 0;
rbio->pick = pick;
rbio->subvol = orig->subvol;
@@ -1104,7 +1264,7 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio);
}
- if (!rbio->pick.idx) {
+ if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1114,7 +1274,9 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio,
+ -BCH_ERR_data_read_device_offline,
+ BLK_STS_IOERR);
goto out;
}
@@ -1140,7 +1302,8 @@ retry_pick:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
+ BLK_STS_IOERR);
goto out;
}
@@ -1156,25 +1319,22 @@ out:
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
- ret = rbio->retry;
+ ret = rbio->ret;
rbio = bch2_rbio_free(rbio);
- if (ret == READ_RETRY_AVOID) {
- bch2_mark_io_failure(failed, &pick);
- ret = READ_RETRY;
- }
-
- if (!ret)
- goto out_read_done;
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
+ bch2_mark_io_failure(failed, &pick,
+ ret == -BCH_ERR_data_read_csum_err);
return ret;
}
err:
if (flags & BCH_READ_in_retry)
- return READ_ERR;
+ return ret;
- orig->bio.bi_status = BLK_STS_IOERR;
+ orig->bio.bi_status = BLK_STS_IOERR;
+ orig->ret = ret;
goto out_read_done;
hole:
@@ -1186,20 +1346,21 @@ hole:
* to read no longer exists we have to signal that:
*/
if (flags & BCH_READ_data_update)
- orig->hole = true;
+ orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
- if (flags & BCH_READ_last_fragment)
+ if ((flags & BCH_READ_last_fragment) &&
+ !(flags & BCH_READ_in_retry))
bch2_rbio_done(orig);
return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed, unsigned flags)
{
- struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
@@ -1232,6 +1393,23 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
if (ret)
goto err;
+ if (unlikely(flags & BCH_READ_in_retry)) {
+ struct data_update *u = flags & BCH_READ_data_update
+ ? container_of(rbio, struct data_update, rbio)
+ : NULL;
+
+ if (u &&
+ !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
+ /* extent we wanted to read no longer exists: */
+ ret = -BCH_ERR_data_read_key_overwritten;
+ goto err;
+ }
+
+ if (!bkey_deleted(&sk.k->k) &&
+ !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
+ failed->nr = 0;
+ }
+
s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
@@ -1271,28 +1449,32 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- ret != READ_RETRY &&
- ret != READ_RETRY_AVOID)
+ !bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
- bch2_trans_iter_exit(trans, &iter);
+ if (unlikely(ret)) {
+ if (ret == -BCH_ERR_no_device_to_read_from && failed)
+ maybe_poison_extent(trans, &iter, k, failed);
- if (ret) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
bvec_iter.bi_sector << 9));
- prt_printf(&buf, "read error %i from btree lookup", ret);
+ prt_printf(&buf, "read error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- rbio->bio.bi_status = BLK_STS_IOERR;
- bch2_rbio_done(rbio);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
+
+ if (!(flags & BCH_READ_in_retry))
+ bch2_rbio_done(rbio);
}
- bch2_trans_put(trans);
+ bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
+ return ret;
}
void bch2_fs_io_read_exit(struct bch_fs *c)
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
index 73275da5..edcf50a4 100644
--- a/libbcachefs/io_read.h
+++ b/libbcachefs/io_read.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
+#include "btree_iter.h"
#include "reflink.h"
struct bch_read_bio {
@@ -40,13 +41,12 @@ struct bch_read_bio {
split:1,
have_ioref:1,
narrow_crcs:1,
- hole:1,
saw_error:1,
- retry:2,
context:2;
};
u16 _state;
};
+ s16 ret;
struct extent_ptr_decoded pick;
@@ -141,22 +141,21 @@ static inline void bch2_read_extent(struct btree_trans *trans,
data_btree, k, offset_into_extent, NULL, flags, -1);
}
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
- struct bch_io_failures failed = { .nr = 0 };
-
BUG_ON(rbio->_state);
rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
- BCH_READ_retry_if_stale|
- BCH_READ_may_promote|
- BCH_READ_user_mapped);
+ bch2_trans_run(c,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
+ BCH_READ_retry_if_stale|
+ BCH_READ_may_promote|
+ BCH_READ_user_mapped));
}
static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
@@ -166,6 +165,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
rbio->c = orig->c;
rbio->_state = 0;
+ rbio->ret = 0;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
@@ -182,6 +182,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
+ rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 738bdbfb..dbfcb28f 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_inum_offset_ratelimited(ca,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_io_error;
}
@@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
- }
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
index bf942566..62773053 100644
--- a/libbcachefs/io_write.h
+++ b/libbcachefs/io_write.h
@@ -11,12 +11,6 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 3d097de8..8d4f3bfa 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1096,8 +1096,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
/* allocate journal on a device: */
-static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
+static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@@ -1225,26 +1225,20 @@ err_free:
return ret;
}
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
+static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr, bool new_fs)
{
struct journal_device *ja = &ca->journal;
- struct closure cl;
int ret = 0;
+ struct closure cl;
closure_init_stack(&cl);
- down_write(&c->state_lock);
-
/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
- goto unlock;
+ return 0;
- while (ja->nr < nr) {
+ while (!ret && ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0, 0 };
/*
@@ -1257,25 +1251,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
* filesystem-wide allocation will succeed, this is a device
* specific allocation - we can hang here:
*/
+ if (!new_fs) {
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
+ }
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
+ ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
- ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+ if (ret == -BCH_ERR_bucket_alloc_blocked ||
+ ret == -BCH_ERR_open_buckets_empty)
+ ret = 0; /* wait and retry */
bch2_disk_reservation_put(c, &disk_res);
-
closure_sync(&cl);
-
- if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
- break;
}
- bch_err_fn(c, ret);
-unlock:
+ return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ down_write(&c->state_lock);
+ int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
up_write(&c->state_lock);
+
+ bch_err_fn(c, ret);
return ret;
}
@@ -1301,7 +1308,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
+ ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7d59ccc0..331c9d76 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1041,13 +1041,19 @@ reread:
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
+ u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);
- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
- "journal read error: sector %llu",
- offset) ||
- bch2_meta_read_fault("journal")) {
+ if (!ret && bch2_meta_read_fault("journal"))
+ ret = -BCH_ERR_EIO_fault_injected;
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ submit_time, !ret);
+
+ if (ret) {
+ bch_err_dev_ratelimited(ca,
+ "journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
@@ -1110,13 +1116,16 @@ reread:
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good) {
+ bch_err_dev_ratelimited(ca, "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf));
saw_bad = true;
+ }
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1655,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}
bool completed = false;
+ bool do_discards = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1667,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done)
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
- bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@@ -1718,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
+
+ if (do_discards)
+ bch2_do_discards(c);
}
static void journal_write_endio(struct bio *bio)
@@ -1727,13 +1739,16 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ jbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("journal")) {
- unsigned long flags;
+ bch2_blk_status_to_str(bio->bi_status));
+ unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
@@ -1762,7 +1777,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
sectors);
struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
+ struct journal_bio *jbio = ja->bio[w->idx];
+ struct bio *bio = &jbio->bio;
+
+ jbio->submit_time = local_clock();
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1794,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ /*
+ * Wait for previous journal writes to complete; they won't necessarily
+ * be flushed if they're still in flight
+ */
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index a0b17c6e..fd82f5d8 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -175,6 +175,7 @@ typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
+ u64 submit_time;
struct bio bio;
};
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index e944f279..a3096e2a 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -125,8 +125,8 @@ static void move_write(struct moving_io *io)
&ctxt->stats->sectors_error_corrected);
}
- if (unlikely(io->write.rbio.bio.bi_status ||
- io->write.rbio.hole ||
+ if (unlikely(io->write.rbio.ret ||
+ io->write.rbio.bio.bi_status ||
io->write.data_opts.scrub)) {
move_free(io);
return;
@@ -816,7 +816,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!bp.v->level)
ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
else if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
else
ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 4ecb721c..fa19fc44 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a;
- int ret;
- if (bch2_bucket_is_open(trans->c,
- b->k.bucket.inode,
- b->k.bucket.offset))
+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
return 0;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_cached);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (ret)
return ret;
@@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (!ca)
goto out;
- a = bch2_alloc_to_v4(k, &_a);
+ if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !bch2_dev_is_online(ca))
+ goto out_put;
+
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
ret = lru_idx && lru_idx <= time;
-
+out_put:
bch2_dev_put(ca);
out:
bch2_trans_iter_exit(trans, &iter);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 071a92ec..afb89d31 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -145,6 +145,11 @@ enum fsck_err_opts {
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
+ x(write_error_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 300), \
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
+ NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
index 41855796..e89b9c78 100644
--- a/libbcachefs/recovery_passes_types.h
+++ b/libbcachefs/recovery_passes_types.h
@@ -24,7 +24,7 @@
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
+ x(stripes_read, 1, 0) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
x(check_allocations, 5, PASS_FSCK) \
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 50118661..68172c6e 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 21130ead..acb5d845 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -91,9 +91,6 @@
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(directory_size, \
- BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
- BCH_FSCK_ERR_directory_size_mismatch) \
x(cached_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_ptr_to_missing_backpointer) \
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index cdafd877..67455beb 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -179,6 +179,7 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
+ x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
@@ -316,7 +317,7 @@ enum bch_fsck_flags {
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
- x(MAX, 306, 0)
+ x(MAX, 307, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index b29b6c6c..38261638 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
return ret;
}
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed;
@@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
{
+ might_sleep();
+
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
if (ca && !percpu_ref_tryget(&ca->io_ref))
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index 7e7c66a1..7c403427 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock)
EXPORT_SYMBOL_GPL(six_lock_exit);
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags)
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp)
{
atomic_set(&lock->state, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
* failure if they wish by checking lock->readers, but generally
* will not want to treat it as an error.
*/
- lock->readers = alloc_percpu(unsigned);
+ lock->readers = alloc_percpu_gfp(unsigned, gfp);
}
#endif
}
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index c142e06b..59b851cf 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -164,18 +164,19 @@ enum six_lock_init_flags {
};
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags);
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp);
/**
* six_lock_init - initialize a six lock
* @lock: lock to initialize
* @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
*/
-#define six_lock_init(lock, flags) \
+#define six_lock_init(lock, flags, gfp) \
do { \
static struct lock_class_key __key; \
\
- __six_lock_init((lock), #lock, &__key, flags); \
+ __six_lock_init((lock), #lock, &__key, flags, gfp); \
} while (0)
/**
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 0edc8814..ee32d043 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
struct bch2_metadata_version {
u16 version;
const char *name;
@@ -69,14 +66,22 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}
-void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+ version <= c->sb.version_incompat_allowed)
+ ? 0
+ : -BCH_ERR_may_not_use_incompat_feature;
+
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return ret;
}
const char * const bch2_sb_fields[] = {
@@ -366,7 +371,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
- u16 block_size;
int ret;
ret = bch2_sb_compatible(sb, out);
@@ -385,8 +389,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_features;
}
- block_size = le16_to_cpu(sb->block_size);
-
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@@ -452,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
}
#ifdef __KERNEL__
@@ -743,7 +748,7 @@ retry:
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
- sb->holder = kmalloc(1, GFP_KERNEL);
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;
@@ -906,16 +911,16 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
+
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "superblock %s error: %s",
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
+ }
closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
@@ -1154,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -1;
+ ret = -BCH_ERR_erofs_sb_err;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@@ -1211,11 +1216,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
c->disk_sb.sb->version = cpu_to_le16(new_version);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- if (incompat)
+ if (incompat) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
+ }
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index f1ab4f94..167dd98f 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
{
- if (unlikely(version > c->sb.version_incompat)) {
- if (version > c->sb.version_incompat_allowed)
- return false;
- bch2_set_version_incompat(c, version);
- }
- return true;
+ return likely(version <= c->sb.version_incompat)
+ ? 0
+ : bch2_set_version_incompat(c, version);
}
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 10c281ad..cffad3b6 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c)
}
set_bit(BCH_FS_started, &c->flags);
+ wake_up(&c->ro_ref_wait);
if (c->opts.read_only) {
bch2_fs_read_only(c);
@@ -1431,6 +1432,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
+ /*
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
+ * attached to a filesystem, we will always close the block device
+ * before tearing down the filesystem object.
+ */
+ ca->disk_sb.holder->c = ca->fs;
+
ca->dev = ca->disk_sb.bdev->bd_dev;
percpu_ref_reinit(&ca->io_ref);
@@ -2016,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
+{
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
+ struct bch_fs *c = holder->c;
+
+ if (c && !bch2_ro_ref_tryget(c))
+ c = NULL;
+
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ if (c)
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+ return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+ for_each_member_device(c, ca)
+ if (ca->disk_sb.bdev == bdev)
+ return ca;
+ return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ }
+
+ down_write(&c->state_lock);
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto unlock;
+
+ if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+ __bch2_dev_offline(c, ca);
+ } else {
+ if (sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ evict_inodes(sb);
+ }
+
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only(c);
+ }
+
+ bch2_dev_put(ca);
+unlock:
+ if (sb)
+ up_read(&sb->s_umount);
+ up_write(&c->state_lock);
+ bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ }
+
+ bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+ .mark_dead = bch2_fs_bdev_mark_dead,
+ .sync = bch2_fs_bdev_sync,
+};
+
/* Filesystem open: */
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 04f8287e..23533bce 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
+
#endif /* _BCACHEFS_SUPER_H */
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 368a63d9..3a899f79 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
+struct bch_fs;
+
+struct bch_sb_handle_holder {
+ struct bch_fs *c;
+};
+
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
- void *holder;
+ struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index a9953181..2ed3f755 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -174,7 +174,6 @@ read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
-read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
read_attribute(nocow_lock_table);
@@ -355,9 +354,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);
- if (attr == &sysfs_stripes_heap)
- bch2_stripes_heap_to_text(out, c);
-
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);
@@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
- &sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
#ifdef BCH_WRITE_REF_DEBUG
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 5718988d..c8669a6b 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race,
TP_ARGS(bio)
);
+/* ec.c */
+
+TRACE_EVENT(stripe_create,
+ TP_PROTO(struct bch_fs *c, u64 idx, int ret),
+ TP_ARGS(c, idx, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, idx )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->idx = idx;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d idx %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->idx,
+ __entry->ret)
+);
+
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
diff --git a/linux/blkdev.c b/linux/blkdev.c
index e496fc11..eb257d8b 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -208,6 +208,8 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
bdev->bd_inode = &bdev->__bd_inode;
+ mutex_init(&bdev->bd_holder_lock);
+
struct file *file = calloc(sizeof(*file), 1);
file->f_inode = bdev->bd_inode;