author     Kent Overstreet <kent.overstreet@gmail.com>  2018-03-08 17:49:16 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>  2018-05-22 00:44:18 -0400
commit     b5189c85527cd13ca46640bf3206678346154118 (patch)
tree       8f28962ff91c504bf7c06bdd8f669adbb441d80a
parent     9ac260ea3f7ccb646eb635a41270763116329d06 (diff)
bcachefs: Centralize error handling in read path
Prep work for erasure coding - need to plumb in reconstruct reads.
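
At its core, the patch changes the pointer-pick helpers (bch2_btree_pick_ptr(), bch2_extent_pick_ptr()) to return an int instead of stashing a struct bch_dev * (or an ERR_PTR) in struct extent_pick_ptr, so every read path handles "hole", "error" and "picked a pointer" through one return-code convention. Below is a minimal sketch of that convention; it is an illustrative model using simplified stand-in types, not the real bcachefs definitions.

/*
 * Illustrative model only: the function names mirror the patch, but the
 * types are simplified stand-ins, not the real bcachefs definitions.
 */
#include <stdbool.h>
#include <stdio.h>

struct extent_pick_ptr {
	unsigned	dev;	/* the struct bch_dev * member is gone */
};

/* New convention: <0 error, 0 nothing to read (hole), >0 picked a pointer. */
static int extent_pick_read_device(const bool *avoid, unsigned ndev,
				   struct extent_pick_ptr *pick)
{
	unsigned d;

	for (d = 0; d < ndev; d++) {
		if (avoid && avoid[d])
			continue;
		pick->dev = d;
		return 1;
	}

	return 0;
}

static void read_extent(const bool *avoid, unsigned ndev)
{
	struct extent_pick_ptr pick;
	int ret = extent_pick_read_device(avoid, ndev, &pick);

	if (ret < 0)		/* e.g. KEY_TYPE_ERROR maps to -EIO */
		printf("io error\n");
	else if (!ret)		/* hole or reservation: just zero fill */
		printf("zero fill\n");
	else			/* got a pointer: grab the ioref and submit */
		printf("read from device %u\n", pick.dev);
}

int main(void)
{
	bool avoid[2] = { true, false };

	read_extent(NULL, 2);	/* picks device 0 */
	read_extent(avoid, 2);	/* device 0 avoided after a failed retry */
	return 0;
}
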
 fs/bcachefs/btree_io.c      |  98
 fs/bcachefs/btree_io.h      |   2
 fs/bcachefs/debug.c         |  12
 fs/bcachefs/extents.c       |  85
 fs/bcachefs/extents.h       |  12
 fs/bcachefs/extents_types.h |   1
 fs/bcachefs/fs-io.c         |  48
 fs/bcachefs/io.c            | 493
 fs/bcachefs/io.h            |  28
 fs/bcachefs/io_types.h      |  12
 fs/bcachefs/move.c          |  25
 fs/bcachefs/super.h         |  21
 12 files changed, 460 insertions(+), 377 deletions(-)
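
The other caller-visible change is the new BCH_READ_LAST_FRAGMENT flag: instead of each caller tracking is_last and doing its own bio_inc_remaining()/clone bookkeeping, the fragment loops (bchfs_read(), bch2_read(), the retry paths) mark the fragment that finishes the bio and let __bch2_read_extent() handle splitting and completion. The following is a rough model of that loop; bio_model and read_extent() are stand-ins for struct bio and bch2_read_extent(), not kernel API.

/*
 * Rough model of the new fragment loop; bio_model is a stand-in for
 * struct bio, and read_extent() stands in for bch2_read_extent().
 */
#include <stdio.h>

#define BCH_READ_LAST_FRAGMENT	(1U << 4)

struct bio_model {
	unsigned	sector;		/* bi_iter.bi_sector */
	unsigned	size;		/* bi_iter.bi_size, in bytes */
};

static void read_extent(struct bio_model *bio, unsigned bytes, unsigned flags)
{
	printf("read %u bytes at sector %u%s\n", bytes, bio->sector,
	       flags & BCH_READ_LAST_FRAGMENT ? " (last fragment)" : "");
}

static void read_loop(struct bio_model *bio, const unsigned *extent_end,
		      unsigned nr_extents)
{
	unsigned i;

	for (i = 0; i < nr_extents && bio->size; i++) {
		unsigned bytes = (extent_end[i] - bio->sector) << 9;
		unsigned flags = 0;

		if (bytes >= bio->size) {
			bytes = bio->size;
			flags |= BCH_READ_LAST_FRAGMENT;
		}

		read_extent(bio, bytes, flags);
		if (flags & BCH_READ_LAST_FRAGMENT)
			return;		/* read_extent() completes the bio */

		bio->sector += bytes >> 9;
		bio->size -= bytes;
	}
}

int main(void)
{
	struct bio_model bio = { .sector = 0, .size = 24 << 9 };
	unsigned extent_end[] = { 8, 16, 64 };	/* extent boundaries, in sectors */

	read_loop(&bio, extent_end, 3);
	return 0;
}
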
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0525c3b87f95..42190f05a0c1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1323,37 +1323,48 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_devs_mask avoid; + bool can_retry; memset(&avoid, 0, sizeof(avoid)); goto start; - do { + while (1) { bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio); - bio_set_dev(bio, rb->pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); - submit_bio_wait(bio); + + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } start: - bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read"); - percpu_ref_put(&rb->pick.ca->io_ref); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; - __set_bit(rb->pick.ca->dev_idx, avoid.d); - rb->pick = bch2_btree_pick_ptr(c, b, &avoid); + __set_bit(rb->pick.ptr.dev, avoid.d); + can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca))) - goto out; - } while (!IS_ERR_OR_NULL(rb->pick.ca)); + !bch2_btree_node_read_done(c, b, can_retry)) + break; - set_btree_node_read_error(b); -out: - if (!IS_ERR_OR_NULL(rb->pick.ca)) - percpu_ref_put(&rb->pick.ca->io_ref); + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } bch2_time_stats_update(&c->btree_read_time, rb->start_time); bio_put(&rb->bio); @@ -1365,10 +1376,13 @@ static void btree_node_read_endio(struct bio *bio) { struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; - bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time >> 10, READ); + } - INIT_WORK(&rb->work, btree_node_read_work); queue_work(system_unbound_wq, &rb->work); } @@ -1377,41 +1391,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, { struct extent_pick_ptr pick; struct btree_read_bio *rb; + struct bch_dev *ca; struct bio *bio; + int ret; trace_btree_read(c, b); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (bch2_fs_fatal_err_on(!pick.ca, c, + ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from")) { set_btree_node_read_error(b); return; } + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; - bio_set_dev(bio, pick.ca->disk_sb.bdev); + INIT_WORK(&rb->work, btree_node_read_work); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = b; bch2_bio_map(bio, b->data); - 
this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE], - bio_sectors(bio)); - set_btree_node_read_in_flight(b); - if (sync) { - submit_bio_wait(bio); - bio->bi_private = b; - btree_node_read_work(&rb->work); + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + bio->bi_private = b; + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } } else { - bio->bi_end_io = btree_node_read_endio; - bio->bi_private = b; - submit_bio(bio); + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(system_unbound_wq, &rb->work); + } } @@ -1593,20 +1624,21 @@ static void btree_node_write_endio(struct bio *bio) struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); unsigned long flags; - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); if (bio->bi_status == BLK_STS_REMOVED || bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, ca->dev_idx); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } - if (wbio->have_io_ref) + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); if (parent) { diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 01df817d3eeb..947685f925b1 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -12,8 +12,8 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; - unsigned submit_time_us; u64 start_time; + unsigned have_ioref:1; struct extent_pick_ptr pick; struct work_struct work; struct bio bio; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7190990dbfa1..71f649bc4c7f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; struct extent_pick_ptr pick; + struct bch_dev *ca; struct bio *bio; if (c->opts.nochanges) @@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->btree_id = b->btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (IS_ERR_OR_NULL(pick.ca)) + if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + return; + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) return; bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); - bio_set_dev(bio, pick.ca->disk_sb.bdev); + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); @@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); + percpu_ref_put(&ca->io_ref); memcpy(n_ondisk, n_sorted, btree_bytes(c)); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c5d1e7cb539b..c6f969c4cfae 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -588,9 +588,12 @@ out: return out - buf; } -static inline bool dev_latency_better(struct bch_dev *dev1, - struct 
bch_dev *dev2) +static inline bool dev_latency_better(struct bch_fs *c, + const struct bch_extent_ptr *ptr1, + const struct bch_extent_ptr *ptr2) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); unsigned l1 = atomic_read(&dev1->latency[READ]); unsigned l2 = atomic_read(&dev2->latency[READ]); @@ -599,47 +602,37 @@ static inline bool dev_latency_better(struct bch_dev *dev1, return bch2_rand_range(l1 + l2) > l1; } -static void extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +static int extent_pick_read_device(struct bch_fs *c, + struct bkey_s_c_extent e, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; + int ret = 0; extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr->cached && ptr_stale(ca, ptr)) continue; - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + if (avoid && test_bit(ptr->dev, avoid->d)) continue; - if (avoid) { - if (test_bit(ca->dev_idx, avoid->d)) - continue; - - if (pick->ca && - test_bit(pick->ca->dev_idx, avoid->d)) - goto use; - } - - if (pick->ca && !dev_latency_better(ca, pick->ca)) - continue; -use: - if (!percpu_ref_tryget(&ca->io_ref)) + if (ret && !dev_latency_better(c, ptr, &pick->ptr)) continue; - if (pick->ca) - percpu_ref_put(&pick->ca->io_ref); - *pick = (struct extent_pick_ptr) { .ptr = *ptr, .crc = crc, - .ca = ca, }; + + ret = 1; } + + return ret; } /* Btree ptrs */ @@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, #undef p } -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid) +int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct extent_pick_ptr pick = { .ca = NULL }; - - extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, &pick); - - return pick; + return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), + avoid, pick); } /* Extents */ @@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to * other devices, it will still pick a pointer from avoid. 
*/ -void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *ret) +int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct bkey_s_c_extent e; + int ret; switch (k.k->type) { case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_COOKIE: - ret->ca = NULL; - return; + return 0; case KEY_TYPE_ERROR: - ret->ca = ERR_PTR(-EIO); - return; + return -EIO; case BCH_EXTENT: case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - ret->ca = NULL; + ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), + avoid, pick); - extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret); + if (!ret && !bkey_extent_is_cached(k.k)) + ret = -EIO; - if (!ret->ca && !bkey_extent_is_cached(e.k)) - ret->ca = ERR_PTR(-EIO); - return; + return ret; case BCH_RESERVATION: - ret->ca = NULL; - return; + return 0; default: BUG(); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 8dc15484f485..338e9e01cf5d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid); +int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *); -void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, - struct extent_pick_ptr *); +int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_devs_mask *, + struct extent_pick_ptr *); enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *, diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 15805cd29ddb..76139f931fe0 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked { struct extent_pick_ptr { struct bch_extent_ptr ptr; struct bch_extent_crc_unpacked crc; - struct bch_dev *ca; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 86e1ba9042af..e2855743540a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -993,11 +993,9 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, BCH_READ_MAY_PROMOTE; while (1) { - struct extent_pick_ptr pick; BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes; - bool is_last; bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); @@ -1016,45 +1014,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_unlock(iter); k = bkey_i_to_s_c(&tmp.k); - bch2_extent_pick_ptr(c, k, NULL, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); - return; - } + if (readpages_iter) { + bool want_full_extent = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + want_full_extent |= !!crc.csum_type | + !!crc.compression_type; + } - if (readpages_iter) readpage_bio_extend(readpages_iter, bio, k.k->p.offset, - pick.ca && - (pick.crc.csum_type || - pick.crc.compression_type)); + want_full_extent); + } bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - bio->bi_iter.bi_sector) << 9; - is_last = bytes == bio->bi_iter.bi_size; 
swap(bio->bi_iter.bi_size, bytes); + if (bytes == bio->bi_iter.bi_size) + flags |= BCH_READ_LAST_FRAGMENT; + if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(bio, k); - if (pick.ca) { - if (!is_last) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } - - bch2_read_extent(c, rbio, bkey_s_c_to_extent(k), - &pick, flags); - } else { - zero_fill_bio(bio); - - if (is_last) - bio_endio(bio); - } + bch2_read_extent(c, rbio, k, flags); - if (is_last) + if (flags & BCH_READ_LAST_FRAGMENT) return; swap(bio->bi_iter.bi_size, bytes); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 308ba4ddbe1b..e0fb40bc6381 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -169,22 +169,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } n->c = c; - n->ca = ca; + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, WRITE); n->submit_time_us = local_clock_us(); n->bio.bi_iter.bi_sector = ptr->offset; if (!journal_flushes_device(ca)) n->bio.bi_opf |= REQ_FUA; - if (likely(percpu_ref_tryget(&ca->io_ref))) { + if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); - n->have_io_ref = true; bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { - n->have_io_ref = false; n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } @@ -318,15 +317,15 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; - - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) - set_bit(ca->dev_idx, op->failed.d); + set_bit(wbio->dev, op->failed.d); - if (wbio->have_io_ref) + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); percpu_ref_put(&ca->io_ref); + } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -931,7 +930,7 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio, return op; } -static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, +static bool should_promote(struct bch_fs *c, struct bkey_s_c k, unsigned flags, u16 target) { if (!target) @@ -943,15 +942,14 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, if (percpu_ref_is_dying(&c->writes)) return false; - return bch2_extent_has_target(c, e, target) == NULL; + if (!bkey_extent_is_data(k.k)) + return false; + + return bch2_extent_has_target(c, bkey_s_c_to_extent(k), target) == NULL; } /* Read */ -static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, u64, - struct bch_devs_mask *, unsigned); - #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -984,27 +982,123 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { - struct bch_read_bio *parent = rbio->parent; - - BUG_ON(!rbio->split); + BUG_ON(rbio->bounce && !rbio->split); if (rbio->promote) kfree(rbio->promote); + rbio->promote = NULL; + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - bio_put(&rbio->bio); - return parent; + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + bio_put(&rbio->bio); + rbio = parent; + } + + return rbio; } -static void bch2_rbio_done(struct bch_read_bio *rbio) +static void 
bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) { - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; - if (rbio->split) - rbio = bch2_rbio_free(rbio); + flags &= ~BCH_READ_LAST_FRAGMENT; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; + goto out; +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bio_endio(&rbio->bio); +} + +static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; +retry: + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, bvec_iter.bi_size, + (k.k->p.offset - bvec_iter.bi_sector) << 9); + swap(bvec_iter.bi_size, bytes); + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; + + if (bytes == bvec_iter.bi_size) + goto out; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + + /* + * If we get here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error %i", ret); +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: bio_endio(&rbio->bio); } @@ -1012,10 +1106,10 @@ static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -1023,26 +1117,19 @@ static void bch2_rbio_retry(struct work_struct *work) memset(&avoid, 0, sizeof(avoid)); if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ca->dev_idx, avoid.d); + __set_bit(rbio->pick.ptr.dev, avoid.d); - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + rbio->bio.bi_status = 0; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - else - rbio->bio.bi_status = 0; + rbio = bch2_rbio_free(rbio); - if (!(flags & BCH_READ_NODECODE)) - flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - 
bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); else - __bch2_read(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1054,8 +1141,10 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_status = error; - bch2_rbio_done(rbio); + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bio_endio(&rbio->bio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, RBIO_CONTEXT_UNBOUND, system_unbound_wq); @@ -1126,12 +1215,13 @@ out: bch2_btree_iter_unlock(&iter); } -static bool should_narrow_crcs(struct bkey_s_c_extent e, +static bool should_narrow_crcs(struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(e, pick->crc); + bkey_extent_is_data(k.k) && + bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); } /* Inner part that may run in process context */ @@ -1139,8 +1229,10 @@ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); @@ -1198,8 +1290,10 @@ static void __bch2_read_endio(struct work_struct *work) promote_start(rbio->promote, rbio); } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) - bch2_rbio_done(rbio); + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bio_endio(&rbio->bio); + } return; csum_err: /* @@ -1213,7 +1307,7 @@ csum_err: return; } - bch2_dev_io_error(rbio->pick.ca, + bch2_dev_io_error(ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, @@ -1232,25 +1326,27 @@ static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); - - percpu_ref_put(&rbio->pick.ca->io_ref); + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time_us, READ); + percpu_ref_put(&ca->io_ref); + } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { + ptr_stale(ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1271,72 +1367,92 @@ static void bch2_read_endio(struct bio *bio) } int __bch2_read_extent(struct bch_fs *c, struct 
bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, unsigned flags) + struct bvec_iter iter, struct bkey_s_c k, + struct bch_devs_mask *avoid, unsigned flags) { + struct extent_pick_ptr pick; struct bch_read_bio *rbio; + struct bch_dev *ca; bool split = false, bounce = false, read_full = false; bool promote = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(e.k); - int ret = 0; + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; - lg_local_lock(&c->usage_lock); - bucket_io_clock_reset(c, pick->ca, - PTR_BUCKET_NR(pick->ca, &pick->ptr), READ); - lg_local_unlock(&c->usage_lock); + pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) + goto no_device; - narrow_crcs = should_narrow_crcs(e, pick, flags); + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (flags & BCH_READ_NODECODE) { - BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); - iter.bi_size = pick->crc.compressed_size << 9; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_sector = pos.offset; + iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = should_narrow_crcs(k, &pick, flags); + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || - e.k->p.offset < bvec_iter_end_sector(iter)); + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick->crc.csum_type) && + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } - promote = should_promote(c, e, flags, orig->opts.promote_target); + promote = should_promote(c, k, flags, orig->opts.promote_target); /* could also set read_full */ if (promote) bounce = true; if (!read_full) { - EBUG_ON(pick->crc.compression_type); - EBUG_ON(pick->crc.csum_type && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - bvec_iter_sectors(iter) != pick->crc.live_size || - pick->crc.offset || + EBUG_ON(pick.crc.compression_type); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || iter.bi_sector != pos.offset)); - pick->ptr.offset += pick->crc.offset + + pick.ptr.offset += pick.crc.offset + (iter.bi_sector - pos.offset); - pick->crc.compressed_size = bvec_iter_sectors(iter); - pick->crc.uncompressed_size = bvec_iter_sectors(iter); - pick->crc.offset = 0; - pick->crc.live_size = bvec_iter_sectors(iter); + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); pos.offset = iter.bi_sector; } if (bounce) { - unsigned 
sectors = pick->crc.compressed_size; + unsigned sectors = pick.crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); @@ -1363,7 +1479,7 @@ noclone: BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; if (split) @@ -1375,135 +1491,117 @@ noclone: rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - rbio->devs_have = bch2_extent_devs(e); - rbio->pick = *pick; + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; rbio->pos = pos; - rbio->version = e.k->version; - rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL; + rbio->version = k.k->version; + rbio->promote = promote ? promote_alloc(rbio, k) : NULL; INIT_WORK(&rbio->work, NULL); - bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev); rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick->ptr.offset; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; if (bounce) trace_read_bounce(&rbio->bio); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], + + if (!rbio->have_ioref) + goto no_device_postclone; + + lg_local_lock(&c->usage_lock); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); + lg_local_unlock(&c->usage_lock); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (!(flags & BCH_READ_LAST_FRAGMENT)) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + submit_bio(&rbio->bio); + return 0; } else { + int ret; + submit_bio_wait(&rbio->bio); rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - if (!ret) - bch2_rbio_done(rbio); - } - - return ret; -} - -static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) -{ - struct extent_pick_ptr pick; - struct btree_iter iter; - BKEY_PADDED(k) tmp; - struct bkey_s_c k; - int ret; - - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); -retry: - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); - goto err; - } - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + rbio = bch2_rbio_free(rbio); - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bkey_start_offset(k.k) != bvec_iter.bi_sector) - goto err; + if (ret == READ_RETRY_AVOID) { + __set_bit(pick.ptr.dev, avoid->d); + ret = READ_RETRY; + } - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; + return ret; } - if (!pick.ca) - goto err; +no_device_postclone: + if (!rbio->split) + rbio->bio.bi_end_io = 
rbio->end_io; + bch2_rbio_free(rbio); +no_device: + __bcache_io_error(c, "no device to read from"); - if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { - percpu_ref_put(&pick.ca->io_ref); - goto err; + if (likely(!(flags & BCH_READ_IN_RETRY))) { + orig->bio.bi_status = BLK_STS_IOERR; + if (flags & BCH_READ_LAST_FRAGMENT) + bio_endio(&orig->bio); + return 0; + } else { + return READ_ERR; } - ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - - return; -err: +hole: /* - * extent we wanted to read no longer exists, or - * was merged or partially overwritten (and thus - * possibly bigger than the memory that was - * originally allocated) + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: */ - rbio->bio.bi_status = BLK_STS_AGAIN; - bio_endio(&rbio->bio); - return; + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + + if (flags & BCH_READ_LAST_FRAGMENT) + bio_endio(&orig->bio); + return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) +void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_iter iter; struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED; int ret; - EBUG_ON(flags & BCH_READ_NODECODE); -retry: + BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_IN_RETRY); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), + POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - struct bvec_iter fragment; + unsigned bytes; /* * Unlock the iterator while the btree node's lock is still in @@ -1513,49 +1611,20 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } + bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, + (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + swap(rbio->bio.bi_iter.bi_size, bytes); - fragment = bvec_iter; - fragment.bi_size = (min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector) << 9; + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; - if (pick.ca) { - if (fragment.bi_size != bvec_iter.bi_size) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } - - ret = __bch2_read_extent(c, rbio, fragment, - bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - return; - }; - } else { - zero_fill_bio_iter(&rbio->bio, fragment); - - if (fragment.bi_size == bvec_iter.bi_size) - bio_endio(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - if (fragment.bi_size == bvec_iter.bi_size) + if (flags & BCH_READ_LAST_FRAGMENT) return; - bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); + 
swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } /* diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index a0c795abe9bd..ac7becbd5f21 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,40 +99,28 @@ struct cache_promote_op; struct extent_pick_ptr; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c_extent e, struct extent_pick_ptr *, - unsigned); -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_devs_mask *, unsigned); +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 4, - BCH_READ_MUST_CLONE = 1 << 5, - BCH_READ_IN_RETRY = 1 << 6, + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, }; static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, + struct bkey_s_c k, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); -} - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) -{ - BUG_ON(rbio->_state); - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); } static inline struct bch_read_bio *rbio_init(struct bio *bio, diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index a022ab335428..f114a8c0aff8 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -36,16 +36,18 @@ struct bch_read_bio { struct bvec_iter bvec_iter; unsigned submit_time_us; - u8 flags; + u16 flags; union { struct { - u8 bounce:1, + u16 bounce:1, split:1, + have_ioref:1, narrow_crcs:1, + hole:1, retry:2, context:2; }; - u8 _state; + u16 _state; }; struct bch_devs_list devs_have; @@ -66,16 +68,16 @@ struct bch_read_bio { struct bch_write_bio { struct bch_fs *c; - struct bch_dev *ca; struct bch_write_bio *parent; struct bch_devs_list failed; u8 order; + u8 dev; unsigned split:1, bounce:1, put_bio:1, - have_io_ref:1, + have_ioref:1, used_mempool:1; unsigned submit_time_us; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 38235d646a94..dbb7dd2a5f9b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -22,7 +22,6 @@ struct moving_io { struct closure cl; bool read_completed; - unsigned read_dev; unsigned read_sectors; unsigned write_sectors; @@ -42,7 +41,7 @@ struct moving_context { struct list_head reads; /* in flight sectors: */ - atomic_t read_sectors[BCH_SB_MEMBERS_MAX]; + atomic_t read_sectors; atomic_t write_sectors; wait_queue_head_t wait; @@ -306,7 +305,7 @@ static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - if (unlikely(io->rbio.bio.bi_status)) { + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { closure_return_with_destructor(cl, move_free); return; } @@ -331,7 +330,7 @@ static void move_read_endio(struct bio *bio) struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); struct moving_context *ctxt = io->write.ctxt; - atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; if 
(next_pending_write(ctxt)) @@ -377,7 +376,6 @@ static int bch2_move_extent(struct bch_fs *c, enum data_cmd data_cmd, struct data_opts data_opts) { - struct extent_pick_ptr pick; struct moving_io *io; const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; @@ -388,12 +386,8 @@ static int bch2_move_extent(struct bch_fs *c, atomic_read(&ctxt->write_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_extent_pick_ptr(c, e.s_c, NULL, &pick); - if (IS_ERR_OR_NULL(pick.ca)) - return pick.ca ? PTR_ERR(pick.ca) : 0; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) < + atomic_read(&ctxt->read_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ @@ -407,8 +401,7 @@ static int bch2_move_extent(struct bch_fs *c, goto err; io->write.ctxt = ctxt; - io->read_dev = pick.ca->dev_idx; - io->read_sectors = pick.crc.uncompressed_size; + io->read_sectors = e.k->size; io->write_sectors = e.k->size; bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); @@ -422,6 +415,7 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + io->rbio.bio.bi_vcnt = pages; bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; @@ -439,7 +433,7 @@ static int bch2_move_extent(struct bch_fs *c, trace_move_extent(e.k); - atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); /* @@ -447,14 +441,15 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE); + bch2_read_extent(c, &io->rbio, e.s_c, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); return 0; err_free_pages: bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - percpu_ref_put(&pick.ca->io_ref); trace_move_alloc_fail(e.k); return ret; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index a52ee3bb37ee..231bc5295740 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return ca->disk_sb.bdev != NULL; + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_FAILED; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_RW || + (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; } static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) |
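
The bch2_dev_get_ioref() helper added at the end of the super.h hunk is what the submit paths above (bch2_submit_wbio_replicas(), btree_node_read_work(), __bch2_read_extent()) now share: try the device's io_ref, check the member state against the IO direction, and if either check fails, complete the bio with BLK_STS_REMOVED instead of touching a possibly-departed bdev. A simplified sketch of that caller-side pattern follows; dev_model, dev_get_ioref() and submit() are stand-ins, not kernel API.

/*
 * Simplified sketch of the ioref-gated submit pattern from the patch;
 * dev_model, dev_get_ioref() and submit() are stand-ins, not kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

enum member_state { STATE_RW, STATE_RO, STATE_FAILED };
enum { READ, WRITE };

struct dev_model {
	enum member_state	state;
	bool			online;	/* models percpu_ref_tryget(&ca->io_ref) */
};

static bool dev_get_ioref(struct dev_model *ca, int rw)
{
	if (!ca->online)		/* percpu_ref_tryget() failed */
		return false;

	if (ca->state == STATE_RW ||
	    (ca->state == STATE_RO && rw == READ))
		return true;

	/* the real helper drops the ref again here (percpu_ref_put()) */
	return false;
}

static void submit(struct dev_model *ca, int rw)
{
	if (dev_get_ioref(ca, rw))
		printf("bio_set_dev() + submit_bio()\n");
	else
		printf("bio->bi_status = BLK_STS_REMOVED; bio_endio()\n");
}

int main(void)
{
	struct dev_model ro_dev = { .state = STATE_RO, .online = true };

	submit(&ro_dev, READ);	/* reads from an RO device still go through */
	submit(&ro_dev, WRITE);	/* writes to it complete with BLK_STS_REMOVED */
	return 0;
}
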