diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2022-03-18 00:42:09 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2022-06-16 22:01:10 -0400 |
commit | 9b55cb8524e8cd0dfe02a33a68808050f61729b8 (patch) | |
tree | 4c9411af44ede6a88122808fc85665cede33bd38 | |
parent | 64b5dac13228b8a5caaefb7f5d2a17a3dd1cac84 (diff) |
bcachefs: Copygc now uses backpointers
Previously, copygc needed to walk the entire extents & reflink btrees to
find extents that needed to be moved.
Now that we have backpointers, this patch implements
bch2_evacuate_bucket() in the move code, which copygc now uses for
evacuating mostly empty buckets.
Also, thanks to the new backpointers code, copygc can now move btree
nodes.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r-- | fs/bcachefs/buckets_types.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/move.c | 285 | ||||
-rw-r--r-- | fs/bcachefs/move.h | 6 | ||||
-rw-r--r-- | fs/bcachefs/movinggc.c | 242 |
4 files changed, 272 insertions, 263 deletions
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 0a9dd5af3524..1dbba7d906dd 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -95,7 +95,7 @@ struct copygc_heap_entry { u8 replicas; u32 fragmentation; u32 sectors; - u64 offset; + u64 bucket; }; typedef HEAP(struct copygc_heap_entry) copygc_heap; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a8f6d5a3e1ba..36d20dc8aaf3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" @@ -9,6 +11,7 @@ #include "buckets.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -632,6 +635,70 @@ err: return ret; } +static int move_ratelimit(struct btree_trans *trans, + struct moving_context *ctxt, + struct bch_ratelimit *rate) +{ + u64 delay; + + do { + delay = rate ? bch2_ratelimit_delay(rate) : 0; + + if (delay) { + bch2_trans_unlock(trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 1; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + return 0; +} + +static int move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c k, u64 *cur_inum) +{ + struct bch_inode_unpacked inode; + int ret; + + if (*cur_inum == k.k->p.inode) + return 0; + + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (ret == -EINTR) + return ret; + + if (!ret) + bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); + + *cur_inum = k.k->p.inode; + return 0; +} + static int __bch2_move_data(struct bch_fs *c, struct moving_context *ctxt, struct bch_ratelimit *rate, @@ -642,7 +709,6 @@ static int __bch2_move_data(struct bch_fs *c, struct bch_move_stats *stats, enum btree_id btree_id) { - bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; @@ -650,7 +716,7 @@ static int __bch2_move_data(struct bch_fs *c, struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; - u64 delay, cur_inum = U64_MAX; + u64 cur_inum = U64_MAX; int ret = 0, ret2; bch2_bkey_buf_init(&sk); @@ -667,37 +733,7 @@ static int __bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_reset(rate); - while (1) { - do { - delay = rate ? bch2_ratelimit_delay(rate) : 0; - - if (delay) { - bch2_trans_unlock(&trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if (kthread && (ret = kthread_should_stop())) { - __set_current_state(TASK_RUNNING); - goto out; - } - - if (delay) - schedule_timeout(delay); - - if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); - try_to_freeze(); - } - } while (delay); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - + while (!move_ratelimit(&trans, ctxt, rate)) { bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -718,23 +754,9 @@ static int __bch2_move_data(struct bch_fs *c, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_extents && - cur_inum != k.k->p.inode) { - struct bch_inode_unpacked inode; - - io_opts = bch2_opts_to_inode_opts(c->opts); - - ret = lookup_inode(&trans, - SPOS(0, k.k->p.inode, k.k->p.snapshot), - &inode); - if (ret == -EINTR) - continue; - - if (!ret) - bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); - - cur_inum = k.k->p.inode; - } + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { case DATA_SKIP: @@ -779,7 +801,6 @@ next: next_nondata: bch2_btree_iter_advance(&iter); } -out: bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -848,7 +869,6 @@ int bch2_move_data(struct bch_fs *c, break; } - move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -862,6 +882,167 @@ int bch2_move_data(struct bch_fs *c, return ret; } +static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + + if (!ret && k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + + if (a.v->gen == gen && + a.v->dirty_sectors) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "failed to evacuate bucket "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + enum data_cmd data_cmd, + struct data_opts *data_opts, + struct bch_move_stats *stats) +{ + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct moving_context ctxt = { .stats = stats }; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + u64 bp_offset = 0, cur_inum = U64_MAX; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + progress_list_add(c, stats); + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_user; + + while (!(ret = move_ratelimit(&trans, &ctxt, rate))) { + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (bp_offset == U64_MAX) + break; + + if (!bp.level) { + struct bkey_s_c k; + + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (!k.k) + continue; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_iter_exit(&trans, &iter); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; + + data_opts->target = io_opts.background_target; + data_opts->rewrite_dev = bucket.inode; + + ret = bch2_move_extent(&trans, &ctxt, wp, io_opts, bp.btree_id, k, + data_cmd, *data_opts); + if (ret == -EINTR) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt, &trans); + continue; + } + if (ret) + goto err; + + if (rate) + bch2_ratelimit_increment(rate, k.k->size); + atomic64_add(k.k->size, &stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (!b) + continue; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR) + continue; + if (ret) + goto err; + + if (rate) + bch2_ratelimit_increment(rate, c->opts.btree_node_size >> 9); + atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_moved); + } + + bp_offset++; + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { + bch2_trans_unlock(&trans); + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); + } +err: + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + progress_list_del(c, stats); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); + + trace_move_data(c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); + + return ret; +} + typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_opts *); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 2a789a1158ca..c69b6b5abe9e 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -62,6 +62,12 @@ int bch2_move_data(struct bch_fs *, move_pred_fn, void *, struct bch_move_stats *); +int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct bch_ratelimit *, + struct write_point_specifier, + enum data_cmd, + struct data_opts *, + struct bch_move_stats *); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 99980c3d5d55..efb09e1c3652 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -30,80 +30,6 @@ #include <linux/sort.h> #include <linux/wait.h> -static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: - cmp_int(l->offset, r->offset); -} - -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - copygc_heap *h = &c->copygc_heap; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct copygc_heap_entry search = { - .dev = p.ptr.dev, - .offset = p.ptr.offset, - }; - ssize_t i; - - if (p.ptr.cached) - continue; - - i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); -#if 0 - /* eytzinger search verify code: */ - ssize_t j = -1, k; - - for (k = 0; k < h->used; k++) - if (h->data[k].offset <= ptr->offset && - (j < 0 || h->data[k].offset > h->data[j].offset)) - j = k; - - BUG_ON(i != j); -#endif - if (i >= 0 && - p.ptr.dev == h->data[i].dev && - p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && - p.ptr.gen == h->data[i].gen) { - /* - * We need to use the journal reserve here, because - * - journal reclaim depends on btree key cache - * flushing to make forward progress, - * - which has to make forward progress when the - * journal is pre-reservation full, - * - and depends on allocation - meaning allocator and - * copygc - */ - - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - JOURNAL_WATERMARK_copygc; - data_opts->rewrite_dev = p.ptr.dev; - - if (p.has_ec) - data_opts->nr_replicas += p.ec.redundancy; - - return DATA_REWRITE; - } - } - - return DATA_SKIP; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -111,7 +37,7 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } -static int walk_buckets_to_copygc(struct bch_fs *c) +static int find_buckets_to_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct btree_trans trans; @@ -122,6 +48,14 @@ static int walk_buckets_to_copygc(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have been visited. + */ + h->used = 0; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); @@ -129,7 +63,8 @@ static int walk_buckets_to_copygc(struct bch_fs *c) bch2_alloc_to_v4(k, &a); - if (a.data_type != BCH_DATA_user || + if ((a.data_type != BCH_DATA_btree && + a.data_type != BCH_DATA_user) || a.dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; @@ -141,7 +76,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size), .sectors = a.dirty_sectors, - .offset = bucket_to_sector(ca, iter.pos.offset), + .bucket = iter.pos.offset, }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -152,77 +87,22 @@ static int walk_buckets_to_copygc(struct bch_fs *c) return ret; } -static int bucket_inorder_cmp(const void *_l, const void *_r) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); -} - -static int check_copygc_was_done(struct bch_fs *c, - u64 *sectors_not_moved, - u64 *buckets_not_moved) -{ - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; - struct copygc_heap_entry *i; - int ret = 0; - - sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); - - for (i = h->data; i < h->data + h->used; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - - bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); - - ret = lockrestart_do(&trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - break; - - bch2_alloc_to_v4(k, &a); - - if (a.gen == i->gen && a.dirty_sectors) { - *sectors_not_moved += a.dirty_sectors; - *buckets_not_moved += 1; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e, *i; + struct copygc_heap_entry e; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; - u64 sectors_reserved = 0; - u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; size_t heap_size = 0; + struct data_opts data_opts = { + .nr_replicas = 1, + .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, + }; int ret; bch_move_stats_init(&move_stats, "copygc"); - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - h->used = 0; - for_each_rw_member(ca, c, dev_idx) heap_size += ca->mi.nbuckets >> 7; @@ -234,21 +114,7 @@ static int bch2_copygc(struct bch_fs *c) } } - for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - - u64 avail = max_t(s64, 0, - usage.d[BCH_DATA_free].buckets + - usage.d[BCH_DATA_need_discard].buckets - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, RESERVE_movinggc)); - - avail = min(avail, ca->mi.nbuckets >> 6); - - sectors_reserved += avail * ca->mi.bucket_size; - } - - ret = walk_buckets_to_copygc(c); + ret = find_buckets_to_copygc(c); if (ret) { bch2_fs_fatal_error(c, "error walking buckets to copygc!"); return ret; @@ -259,68 +125,24 @@ static int bch2_copygc(struct bch_fs *c) return 0; } - /* - * Our btree node allocations also come out of RESERVE_movingc: - */ - sectors_reserved = (sectors_reserved * 3) / 4; - if (!sectors_reserved) { - bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); - return -1; - } + heap_resort(h, fragmentation_cmp, NULL); - for (i = h->data; i < h->data + h->used; i++) { - sectors_to_move += i->sectors; - sectors_to_write += i->sectors * i->replicas; - } - - while (sectors_to_write > sectors_reserved) { + while (h->used) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_write -= e.sectors * e.replicas; - } - - buckets_to_move = h->used; - - if (!buckets_to_move) { - bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", - sectors_reserved); - return 0; - } - - eytzinger0_sort(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, NULL); - - ret = bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - NULL, - writepoint_ptr(&c->copygc_write_point), - copygc_pred, NULL, - &move_stats); - if (ret < 0) - bch_err(c, "error %i from bch2_move_data() in copygc", ret); - if (ret) - return ret; - - ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); - if (ret) { - bch_err(c, "error %i from check_copygc_was_done()", ret); - return ret; + /* not correct w.r.t. device removal */ + + ret = bch2_evacuate_bucket(c, POS(e.dev, e.bucket), e.gen, NULL, + writepoint_ptr(&c->copygc_write_point), + DATA_REWRITE, &data_opts, + &move_stats); + if (ret < 0) + bch_err(c, "error %i from bch2_move_data() in copygc", ret); + if (ret) + return ret; } - if (sectors_not_moved) - bch_warn_ratelimited(c, - "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move, - atomic64_read(&move_stats.sectors_moved), - atomic64_read(&move_stats.keys_raced), - atomic64_read(&move_stats.sectors_raced)); - - trace_copygc(c, - atomic64_read(&move_stats.sectors_moved), sectors_not_moved, - buckets_to_move, buckets_not_moved); - return 0; + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; } /* |