author     Kent Overstreet <kent.overstreet@gmail.com>  2018-03-08 17:49:16 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>  2018-05-22 00:44:18 -0400
commit     b5189c85527cd13ca46640bf3206678346154118 (patch)
tree       8f28962ff91c504bf7c06bdd8f669adbb441d80a
parent     9ac260ea3f7ccb646eb635a41270763116329d06 (diff)
bcachefs: Centralize error handling in read path
prep work for erasure coding - need to plumb in reconstruct reads
-rw-r--r--  fs/bcachefs/btree_io.c        98
-rw-r--r--  fs/bcachefs/btree_io.h         2
-rw-r--r--  fs/bcachefs/debug.c           12
-rw-r--r--  fs/bcachefs/extents.c         85
-rw-r--r--  fs/bcachefs/extents.h         12
-rw-r--r--  fs/bcachefs/extents_types.h    1
-rw-r--r--  fs/bcachefs/fs-io.c           48
-rw-r--r--  fs/bcachefs/io.c             493
-rw-r--r--  fs/bcachefs/io.h              28
-rw-r--r--  fs/bcachefs/io_types.h        12
-rw-r--r--  fs/bcachefs/move.c            25
-rw-r--r--  fs/bcachefs/super.h           21
12 files changed, 460 insertions, 377 deletions
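
The patch changes the calling convention of the read-path pick helpers: instead of returning a struct extent_pick_ptr whose .ca member doubled as a NULL/ERR_PTR status, bch2_btree_pick_ptr() and bch2_extent_pick_ptr() now fill in a caller-supplied pick and return an int (negative on error, 0 when there is nothing to read, positive when a pointer was chosen). The caller resolves the device from pick.ptr.dev and takes the I/O reference itself via the new bch2_dev_get_ioref() helper, falling back to BLK_STS_REMOVED when the device is gone. Below is a minimal caller-side sketch of that convention; read_extent_sketch() is a hypothetical illustration, not a function added by the patch:

	/* Hypothetical caller showing the pick/ioref pattern used after this patch. */
	static int read_extent_sketch(struct bch_fs *c, struct bkey_s_c k,
				      struct bch_devs_mask *avoid, struct bio *bio)
	{
		struct extent_pick_ptr pick;
		struct bch_dev *ca;
		int ret;

		ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
		if (ret < 0)
			return ret;	/* e.g. -EIO: error key, or no usable pointer */
		if (!ret)
			return 0;	/* hole/reservation: caller zero-fills */

		/* the device index now comes from the picked pointer itself */
		ca = bch_dev_bkey_exists(c, pick.ptr.dev);

		if (!bch2_dev_get_ioref(ca, READ)) {
			/* device offline or failed: report it like an I/O error */
			bio->bi_status = BLK_STS_REMOVED;
			return 0;
		}

		bio_set_dev(bio, ca->disk_sb.bdev);
		bio->bi_iter.bi_sector = pick.ptr.offset;
		submit_bio_wait(bio);

		percpu_ref_put(&ca->io_ref);
		return bio->bi_status ? -EIO : 0;
	}

This mirrors what btree_node_read_work() and __bch2_read_extent() do in the diff below, where the missing-ioref case is folded into the normal error/retry machinery instead of being special-cased at pick time.
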
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 0525c3b87f95..42190f05a0c1 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1323,37 +1323,48 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
+ bool can_retry;
memset(&avoid, 0, sizeof(avoid));
goto start;
- do {
+ while (1) {
bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
- bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- submit_bio_wait(bio);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
start:
- bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
- percpu_ref_put(&rb->pick.ca->io_ref);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
- __set_bit(rb->pick.ca->dev_idx, avoid.d);
- rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
+ __set_bit(rb->pick.ptr.dev, avoid.d);
+ can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
- goto out;
- } while (!IS_ERR_OR_NULL(rb->pick.ca));
+ !bch2_btree_node_read_done(c, b, can_retry))
+ break;
- set_btree_node_read_error(b);
-out:
- if (!IS_ERR_OR_NULL(rb->pick.ca))
- percpu_ref_put(&rb->pick.ca->io_ref);
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
@@ -1365,10 +1376,13 @@ static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
- bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ bch2_latency_acct(ca, rb->start_time >> 10, READ);
+ }
- INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
@@ -1377,41 +1391,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
+ struct bch_dev *ca;
struct bio *bio;
+ int ret;
trace_btree_read(c, b);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (bch2_fs_fatal_err_on(!pick.ca, c,
+ ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+ if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
- bio_set_dev(bio, pick.ca->disk_sb.bdev);
+ INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
bch2_bio_map(bio, b->data);
- this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
- bio_sectors(bio));
-
set_btree_node_read_in_flight(b);
- if (sync) {
- submit_bio_wait(bio);
- bio->bi_private = b;
- btree_node_read_work(&rb->work);
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
+ bio->bi_private = b;
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
} else {
- bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
- submit_bio(bio);
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(system_unbound_wq, &rb->work);
+
}
}
@@ -1593,20 +1624,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
- if (wbio->have_io_ref)
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 01df817d3eeb..947685f925b1 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -12,8 +12,8 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
- unsigned submit_time_us;
u64 start_time;
+ unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7190990dbfa1..71f649bc4c7f 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
+ struct bch_dev *ca;
struct bio *bio;
if (c->opts.nochanges)
@@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (IS_ERR_OR_NULL(pick.ca))
+ if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
+ return;
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (!bch2_dev_get_ioref(ca, READ))
return;
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
- bio_set_dev(bio, pick.ca->disk_sb.bdev);
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&pick.ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index c5d1e7cb539b..c6f969c4cfae 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -588,9 +588,12 @@ out:
return out - buf;
}
-static inline bool dev_latency_better(struct bch_dev *dev1,
- struct bch_dev *dev2)
+static inline bool dev_latency_better(struct bch_fs *c,
+ const struct bch_extent_ptr *ptr1,
+ const struct bch_extent_ptr *ptr2)
{
+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
unsigned l1 = atomic_read(&dev1->latency[READ]);
unsigned l2 = atomic_read(&dev2->latency[READ]);
@@ -599,47 +602,37 @@ static inline bool dev_latency_better(struct bch_dev *dev1,
return bch2_rand_range(l1 + l2) > l1;
}
-static void extent_pick_read_device(struct bch_fs *c,
- struct bkey_s_c_extent e,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *pick)
+static int extent_pick_read_device(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
+ struct bch_dev *ca;
+ int ret = 0;
extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+ if (avoid && test_bit(ptr->dev, avoid->d))
continue;
- if (avoid) {
- if (test_bit(ca->dev_idx, avoid->d))
- continue;
-
- if (pick->ca &&
- test_bit(pick->ca->dev_idx, avoid->d))
- goto use;
- }
-
- if (pick->ca && !dev_latency_better(ca, pick->ca))
- continue;
-use:
- if (!percpu_ref_tryget(&ca->io_ref))
+ if (ret && !dev_latency_better(c, ptr, &pick->ptr))
continue;
- if (pick->ca)
- percpu_ref_put(&pick->ca->io_ref);
-
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
- .ca = ca,
};
+
+ ret = 1;
}
+
+ return ret;
}
/* Btree ptrs */
@@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
#undef p
}
-struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
- struct bch_devs_mask *avoid)
+int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
- struct extent_pick_ptr pick = { .ca = NULL };
-
- extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
- avoid, &pick);
-
- return pick;
+ return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
+ avoid, pick);
}
/* Extents */
@@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
* other devices, it will still pick a pointer from avoid.
*/
-void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *ret)
+int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
- struct bkey_s_c_extent e;
+ int ret;
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
case KEY_TYPE_COOKIE:
- ret->ca = NULL;
- return;
+ return 0;
case KEY_TYPE_ERROR:
- ret->ca = ERR_PTR(-EIO);
- return;
+ return -EIO;
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
- ret->ca = NULL;
+ ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
+ avoid, pick);
- extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
+ if (!ret && !bkey_extent_is_cached(k.k))
+ ret = -EIO;
- if (!ret->ca && !bkey_extent_is_cached(e.k))
- ret->ca = ERR_PTR(-EIO);
- return;
+ return ret;
case BCH_RESERVATION:
- ret->ca = NULL;
- return;
+ return 0;
default:
BUG();
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 8dc15484f485..338e9e01cf5d 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
-struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
- struct bch_devs_mask *avoid);
+int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *);
-void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_devs_mask *,
- struct extent_pick_ptr *);
+int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_devs_mask *,
+ struct extent_pick_ptr *);
enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index 15805cd29ddb..76139f931fe0 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked {
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
- struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 86e1ba9042af..e2855743540a 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -993,11 +993,9 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
BCH_READ_MAY_PROMOTE;
while (1) {
- struct extent_pick_ptr pick;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
unsigned bytes;
- bool is_last;
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
@@ -1016,45 +1014,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k);
- bch2_extent_pick_ptr(c, k, NULL, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- bio_endio(bio);
- return;
- }
+ if (readpages_iter) {
+ bool want_full_extent = false;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ want_full_extent |= !!crc.csum_type |
+ !!crc.compression_type;
+ }
- if (readpages_iter)
readpage_bio_extend(readpages_iter,
bio, k.k->p.offset,
- pick.ca &&
- (pick.crc.csum_type ||
- pick.crc.compression_type));
+ want_full_extent);
+ }
bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
bio->bi_iter.bi_sector) << 9;
- is_last = bytes == bio->bi_iter.bi_size;
swap(bio->bi_iter.bi_size, bytes);
+ if (bytes == bio->bi_iter.bi_size)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
- if (pick.ca) {
- if (!is_last) {
- bio_inc_remaining(&rbio->bio);
- flags |= BCH_READ_MUST_CLONE;
- trace_read_split(&rbio->bio);
- }
-
- bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
- &pick, flags);
- } else {
- zero_fill_bio(bio);
-
- if (is_last)
- bio_endio(bio);
- }
+ bch2_read_extent(c, rbio, k, flags);
- if (is_last)
+ if (flags & BCH_READ_LAST_FRAGMENT)
return;
swap(bio->bi_iter.bi_size, bytes);
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 308ba4ddbe1b..e0fb40bc6381 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -169,22 +169,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
n->c = c;
- n->ca = ca;
+ n->dev = ptr->dev;
+ n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
if (!journal_flushes_device(ca))
n->bio.bi_opf |= REQ_FUA;
- if (likely(percpu_ref_tryget(&ca->io_ref))) {
+ if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
- n->have_io_ref = true;
bio_set_dev(&n->bio, ca->disk_sb.bdev);
submit_bio(&n->bio);
} else {
- n->have_io_ref = false;
n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
@@ -318,15 +317,15 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
-
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
- set_bit(ca->dev_idx, op->failed.d);
+ set_bit(wbio->dev, op->failed.d);
- if (wbio->have_io_ref)
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
percpu_ref_put(&ca->io_ref);
+ }
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -931,7 +930,7 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
return op;
}
-static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
+static bool should_promote(struct bch_fs *c, struct bkey_s_c k,
unsigned flags, u16 target)
{
if (!target)
@@ -943,15 +942,14 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
if (percpu_ref_is_dying(&c->writes))
return false;
- return bch2_extent_has_target(c, e, target) == NULL;
+ if (!bkey_extent_is_data(k.k))
+ return false;
+
+ return bch2_extent_has_target(c, bkey_s_c_to_extent(k), target) == NULL;
}
/* Read */
-static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
- struct bvec_iter, u64,
- struct bch_devs_mask *, unsigned);
-
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -984,27 +982,123 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
- struct bch_read_bio *parent = rbio->parent;
-
- BUG_ON(!rbio->split);
+ BUG_ON(rbio->bounce && !rbio->split);
if (rbio->promote)
kfree(rbio->promote);
+ rbio->promote = NULL;
+
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
- bio_put(&rbio->bio);
- return parent;
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ bio_put(&rbio->bio);
+ rbio = parent;
+ }
+
+ return rbio;
}
-static void bch2_rbio_done(struct bch_read_bio *rbio)
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
{
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ rbio->pos, BTREE_ITER_SLOTS);
+retry:
+ rbio->bio.bi_status = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if (btree_iter_err(k)) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+ goto out;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+ bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+retry:
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_SLOTS, k) {
+ BKEY_PADDED(k) tmp;
+ unsigned bytes;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ bytes = min_t(unsigned, bvec_iter.bi_size,
+ (k.k->p.offset - bvec_iter.bi_sector) << 9);
+ swap(bvec_iter.bi_size, bytes);
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ switch (ret) {
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ goto err;
+ };
+
+ if (bytes == bvec_iter.bi_size)
+ goto out;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch2_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ __bcache_io_error(c, "btree IO error %i", ret);
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
bio_endio(&rbio->bio);
}
@@ -1012,10 +1106,10 @@ static void bch2_rbio_retry(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->pos.inode;
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ u64 inode = rbio->pos.inode;
struct bch_devs_mask avoid;
trace_read_retry(&rbio->bio);
@@ -1023,26 +1117,19 @@ static void bch2_rbio_retry(struct work_struct *work)
memset(&avoid, 0, sizeof(avoid));
if (rbio->retry == READ_RETRY_AVOID)
- __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+ __set_bit(rbio->pick.ptr.dev, avoid.d);
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
+ rbio->bio.bi_status = 0;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- else
- rbio->bio.bi_status = 0;
+ rbio = bch2_rbio_free(rbio);
- if (!(flags & BCH_READ_NODECODE))
- flags |= BCH_READ_MUST_CLONE;
flags |= BCH_READ_IN_RETRY;
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE)
- bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags);
else
- __bch2_read(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
@@ -1054,8 +1141,10 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_status = error;
- bch2_rbio_done(rbio);
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
+ bio_endio(&rbio->bio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
RBIO_CONTEXT_UNBOUND, system_unbound_wq);
@@ -1126,12 +1215,13 @@ out:
bch2_btree_iter_unlock(&iter);
}
-static bool should_narrow_crcs(struct bkey_s_c_extent e,
+static bool should_narrow_crcs(struct bkey_s_c k,
struct extent_pick_ptr *pick,
unsigned flags)
{
return !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(e, pick->crc);
+ bkey_extent_is_data(k.k) &&
+ bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
}
/* Inner part that may run in process context */
@@ -1139,8 +1229,10 @@ static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
@@ -1198,8 +1290,10 @@ static void __bch2_read_endio(struct work_struct *work)
promote_start(rbio->promote, rbio);
}
nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
- bch2_rbio_done(rbio);
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
+ bio_endio(&rbio->bio);
+ }
return;
csum_err:
/*
@@ -1213,7 +1307,7 @@ csum_err:
return;
}
- bch2_dev_io_error(rbio->pick.ca,
+ bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
@@ -1232,25 +1326,27 @@ static void bch2_read_endio(struct bio *bio)
{
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
-
- percpu_ref_put(&rbio->pick.ca->io_ref);
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time_us, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
if (rbio->pick.ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+ ptr_stale(ca, &rbio->pick.ptr))) {
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -1271,72 +1367,92 @@ static void bch2_read_endio(struct bio *bio)
}
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c_extent e,
- struct extent_pick_ptr *pick, unsigned flags)
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct bch_devs_mask *avoid, unsigned flags)
{
+ struct extent_pick_ptr pick;
struct bch_read_bio *rbio;
+ struct bch_dev *ca;
bool split = false, bounce = false, read_full = false;
bool promote = false, narrow_crcs = false;
- struct bpos pos = bkey_start_pos(e.k);
- int ret = 0;
+ struct bpos pos = bkey_start_pos(k.k);
+ int pick_ret;
- lg_local_lock(&c->usage_lock);
- bucket_io_clock_reset(c, pick->ca,
- PTR_BUCKET_NR(pick->ca, &pick->ptr), READ);
- lg_local_unlock(&c->usage_lock);
+ pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
+
+ if (pick_ret < 0)
+ goto no_device;
- narrow_crcs = should_narrow_crcs(e, pick, flags);
+ if (pick_ret > 0)
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (flags & BCH_READ_NODECODE) {
- BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
- iter.bi_size = pick->crc.compressed_size << 9;
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ goto hole;
+
+ iter.bi_sector = pos.offset;
+ iter.bi_size = pick.crc.compressed_size << 9;
goto noclone;
}
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = should_narrow_crcs(k, &pick, flags);
+
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
- e.k->p.offset < bvec_iter_end_sector(iter));
+ EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+ k.k->p.offset < bvec_iter_end_sector(iter));
- if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
- (pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
+ if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick.crc.csum_type != BCH_CSUM_NONE &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
bounce = true;
}
- promote = should_promote(c, e, flags, orig->opts.promote_target);
+ promote = should_promote(c, k, flags, orig->opts.promote_target);
/* could also set read_full */
if (promote)
bounce = true;
if (!read_full) {
- EBUG_ON(pick->crc.compression_type);
- EBUG_ON(pick->crc.csum_type &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick->crc.live_size ||
- pick->crc.offset ||
+ EBUG_ON(pick.crc.compression_type);
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
iter.bi_sector != pos.offset));
- pick->ptr.offset += pick->crc.offset +
+ pick.ptr.offset += pick.crc.offset +
(iter.bi_sector - pos.offset);
- pick->crc.compressed_size = bvec_iter_sectors(iter);
- pick->crc.uncompressed_size = bvec_iter_sectors(iter);
- pick->crc.offset = 0;
- pick->crc.live_size = bvec_iter_sectors(iter);
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
pos.offset = iter.bi_sector;
}
if (bounce) {
- unsigned sectors = pick->crc.compressed_size;
+ unsigned sectors = pick.crc.compressed_size;
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
@@ -1363,7 +1479,7 @@ noclone:
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
+ BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
rbio->c = c;
if (split)
@@ -1375,135 +1491,117 @@ noclone:
rbio->flags = flags;
rbio->bounce = bounce;
rbio->split = split;
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
- rbio->devs_have = bch2_extent_devs(e);
- rbio->pick = *pick;
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
rbio->pos = pos;
- rbio->version = e.k->version;
- rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL;
+ rbio->version = k.k->version;
+ rbio->promote = promote ? promote_alloc(rbio, k) : NULL;
INIT_WORK(&rbio->work, NULL);
- bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
if (bounce)
trace_read_bounce(&rbio->bio);
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+
+ if (!rbio->have_ioref)
+ goto no_device_postclone;
+
+ lg_local_lock(&c->usage_lock);
+ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
+ lg_local_unlock(&c->usage_lock);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ if (!(flags & BCH_READ_LAST_FRAGMENT)) {
+ bio_inc_remaining(&orig->bio);
+ trace_read_split(&orig->bio);
+ }
+
submit_bio(&rbio->bio);
+ return 0;
} else {
+ int ret;
+
submit_bio_wait(&rbio->bio);
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- if (!ret)
- bch2_rbio_done(rbio);
- }
-
- return ret;
-}
-
-static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct extent_pick_ptr pick;
- struct btree_iter iter;
- BKEY_PADDED(k) tmp;
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
-retry:
- k = bch2_btree_iter_peek_slot(&iter);
- if (btree_iter_err(k)) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
+ rbio = bch2_rbio_free(rbio);
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset) ||
- bkey_start_offset(k.k) != bvec_iter.bi_sector)
- goto err;
+ if (ret == READ_RETRY_AVOID) {
+ __set_bit(pick.ptr.dev, avoid->d);
+ ret = READ_RETRY;
+ }
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
+ return ret;
}
- if (!pick.ca)
- goto err;
+no_device_postclone:
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+ bch2_rbio_free(rbio);
+no_device:
+ __bcache_io_error(c, "no device to read from");
- if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
- percpu_ref_put(&pick.ca->io_ref);
- goto err;
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ orig->bio.bi_status = BLK_STS_IOERR;
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bio_endio(&orig->bio);
+ return 0;
+ } else {
+ return READ_ERR;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- bio_endio(&rbio->bio);
- return;
- };
-
- return;
-err:
+hole:
/*
- * extent we wanted to read no longer exists, or
- * was merged or partially overwritten (and thus
- * possibly bigger than the memory that was
- * originally allocated)
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
*/
- rbio->bio.bi_status = BLK_STS_AGAIN;
- bio_endio(&rbio->bio);
- return;
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bio_endio(&orig->bio);
+ return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
{
struct btree_iter iter;
struct bkey_s_c k;
+ unsigned flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED;
int ret;
- EBUG_ON(flags & BCH_READ_NODECODE);
-retry:
+ BUG_ON(rbio->_state);
+ BUG_ON(flags & BCH_READ_NODECODE);
+ BUG_ON(flags & BCH_READ_IN_RETRY);
+
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
+ POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS, k) {
BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- struct bvec_iter fragment;
+ unsigned bytes;
/*
* Unlock the iterator while the btree node's lock is still in
@@ -1513,49 +1611,20 @@ retry:
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
- }
+ bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
+ (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
- fragment = bvec_iter;
- fragment.bi_size = (min_t(u64, k.k->p.offset,
- bvec_iter_end_sector(bvec_iter)) -
- bvec_iter.bi_sector) << 9;
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
- if (pick.ca) {
- if (fragment.bi_size != bvec_iter.bi_size) {
- bio_inc_remaining(&rbio->bio);
- flags |= BCH_READ_MUST_CLONE;
- trace_read_split(&rbio->bio);
- }
-
- ret = __bch2_read_extent(c, rbio, fragment,
- bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- return;
- };
- } else {
- zero_fill_bio_iter(&rbio->bio, fragment);
-
- if (fragment.bi_size == bvec_iter.bi_size)
- bio_endio(&rbio->bio);
- }
+ bch2_read_extent(c, rbio, k, flags);
- if (fragment.bi_size == bvec_iter.bi_size)
+ if (flags & BCH_READ_LAST_FRAGMENT)
return;
- bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
}
/*
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index a0c795abe9bd..ac7becbd5f21 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -99,40 +99,28 @@ struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c_extent e, struct extent_pick_ptr *,
- unsigned);
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- u64, struct bch_devs_mask *, unsigned);
+ struct bkey_s_c, struct bch_devs_mask *, unsigned);
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
/* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 4,
- BCH_READ_MUST_CLONE = 1 << 5,
- BCH_READ_IN_RETRY = 1 << 6,
+ BCH_READ_MUST_BOUNCE = 1 << 5,
+ BCH_READ_MUST_CLONE = 1 << 6,
+ BCH_READ_IN_RETRY = 1 << 7,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
- struct bkey_s_c_extent e,
- struct extent_pick_ptr *pick,
+ struct bkey_s_c k,
unsigned flags)
{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
-}
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inode)
-{
- BUG_ON(rbio->_state);
- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index a022ab335428..f114a8c0aff8 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -36,16 +36,18 @@ struct bch_read_bio {
struct bvec_iter bvec_iter;
unsigned submit_time_us;
- u8 flags;
+ u16 flags;
union {
struct {
- u8 bounce:1,
+ u16 bounce:1,
split:1,
+ have_ioref:1,
narrow_crcs:1,
+ hole:1,
retry:2,
context:2;
};
- u8 _state;
+ u16 _state;
};
struct bch_devs_list devs_have;
@@ -66,16 +68,16 @@ struct bch_read_bio {
struct bch_write_bio {
struct bch_fs *c;
- struct bch_dev *ca;
struct bch_write_bio *parent;
struct bch_devs_list failed;
u8 order;
+ u8 dev;
unsigned split:1,
bounce:1,
put_bio:1,
- have_io_ref:1,
+ have_ioref:1,
used_mempool:1;
unsigned submit_time_us;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 38235d646a94..dbb7dd2a5f9b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -22,7 +22,6 @@ struct moving_io {
struct closure cl;
bool read_completed;
- unsigned read_dev;
unsigned read_sectors;
unsigned write_sectors;
@@ -42,7 +41,7 @@ struct moving_context {
struct list_head reads;
/* in flight sectors: */
- atomic_t read_sectors[BCH_SB_MEMBERS_MAX];
+ atomic_t read_sectors;
atomic_t write_sectors;
wait_queue_head_t wait;
@@ -306,7 +305,7 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (unlikely(io->rbio.bio.bi_status)) {
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
closure_return_with_destructor(cl, move_free);
return;
}
@@ -331,7 +330,7 @@ static void move_read_endio(struct bio *bio)
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
- atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
if (next_pending_write(ctxt))
@@ -377,7 +376,6 @@ static int bch2_move_extent(struct bch_fs *c,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
- struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
@@ -388,12 +386,8 @@ static int bch2_move_extent(struct bch_fs *c,
atomic_read(&ctxt->write_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
- if (IS_ERR_OR_NULL(pick.ca))
- return pick.ca ? PTR_ERR(pick.ca) : 0;
-
move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
+ atomic_read(&ctxt->read_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
@@ -407,8 +401,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
io->write.ctxt = ctxt;
- io->read_dev = pick.ca->dev_idx;
- io->read_sectors = pick.crc.uncompressed_size;
+ io->read_sectors = e.k->size;
io->write_sectors = e.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
@@ -422,6 +415,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -439,7 +433,7 @@ static int bch2_move_extent(struct bch_fs *c,
trace_move_extent(e.k);
- atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
+ atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
/*
@@ -447,14 +441,15 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
+ bch2_read_extent(c, &io->rbio, e.s_c,
+ BCH_READ_NODECODE|
+ BCH_READ_LAST_FRAGMENT);
return 0;
err_free_pages:
bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
- percpu_ref_put(&pick.ca->io_ref);
trace_move_alloc_fail(e.k);
return ret;
}
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index a52ee3bb37ee..231bc5295740 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return ca->disk_sb.bdev != NULL;
+ return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+ return bch2_dev_is_online(ca) &&
+ ca->mi.state != BCH_MEMBER_STATE_FAILED;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_tryget(&ca->io_ref))
+ return false;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
+ return true;
+
+ percpu_ref_put(&ca->io_ref);
+ return false;
}
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)