Diffstat (limited to 'fs/bcachefs/tier.c')
-rw-r--r-- | fs/bcachefs/tier.c | 305
1 file changed, 190 insertions, 115 deletions
diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c
index e992ab44ed3a..211a844c69cf 100644
--- a/fs/bcachefs/tier.c
+++ b/fs/bcachefs/tier.c
@@ -12,172 +12,247 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/sched/cputime.h>
 #include <trace/events/bcachefs.h>
 
-static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
-                           struct bkey_s_c_extent e)
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+                                      const struct bch_extent_ptr *ptr,
+                                      struct bch_extent_crc_unpacked crc,
+                                      struct bch_io_opts *io_opts)
+{
+        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+        if (io_opts->background_target &&
+            !dev_in_target(ca, io_opts->background_target) &&
+            !ptr->cached)
+                return true;
+
+        if (io_opts->background_compression &&
+            crc.compression_type !=
+            bch2_compression_opt_to_type[io_opts->background_compression])
+                return true;
+
+        return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+                            struct bkey_s_c k,
+                            struct bch_io_opts *io_opts)
+{
+        const struct bch_extent_ptr *ptr;
+        struct bch_extent_crc_unpacked crc;
+        struct bkey_s_c_extent e;
+
+        if (!bkey_extent_is_data(k.k))
+                return;
+
+        if (!io_opts->background_target &&
+            !io_opts->background_compression)
+                return;
+
+        e = bkey_s_c_to_extent(k);
+
+        extent_for_each_ptr_crc(e, ptr, crc)
+                if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+                        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                        if (!atomic64_add_return(crc.compressed_size,
+                                                 &ca->rebalance_work))
+                                rebalance_wakeup(c);
+                }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+        if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
+                rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+                                    enum bkey_type type,
+                                    struct bkey_s_c_extent e,
+                                    struct bch_io_opts *io_opts,
+                                    struct data_opts *data_opts)
 {
         const struct bch_extent_ptr *ptr;
-        unsigned replicas = 0;
+        struct bch_extent_crc_unpacked crc;
 
         /* Make sure we have room to add a new pointer: */
         if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
             BKEY_EXTENT_VAL_U64s_MAX)
-                return false;
+                return DATA_SKIP;
 
-        extent_for_each_ptr(e, ptr)
-                if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
-                        replicas++;
+        extent_for_each_ptr_crc(e, ptr, crc)
+                if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+                        goto found;
 
-        return replicas < c->opts.data_replicas;
+        return DATA_SKIP;
+found:
+        data_opts->target = io_opts->background_target;
+        data_opts->btree_insert_flags = 0;
+        return DATA_ADD_REPLICAS;
 }
 
-static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
-                                  enum bkey_type type,
-                                  struct bkey_s_c_extent e,
-                                  struct bch_io_opts *io_opts,
-                                  struct data_opts *data_opts)
+struct rebalance_work {
+        unsigned        dev_most_full_percent;
+        u64             dev_most_full_work;
+        u64             dev_most_full_capacity;
+        u64             total_work;
+};
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
 {
-        struct bch_tier *tier = arg;
+        struct bch_dev *ca;
+        struct rebalance_work ret = { 0 };
+        unsigned i;
 
-        if (!__tiering_pred(c, tier, e))
-                return DATA_SKIP;
+        for_each_online_member(ca, c, i) {
+                u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
+                                                ca->mi.first_bucket);
+                u64 work = atomic64_read(&ca->rebalance_work) +
+                        atomic64_read(&c->rebalance_work_unknown_dev);
+                unsigned percent_full = div_u64(work * 100, capacity);
+
+                if (percent_full > ret.dev_most_full_percent) {
+                        ret.dev_most_full_percent = percent_full;
+                        ret.dev_most_full_work = work;
+                        ret.dev_most_full_capacity = capacity;
+                }
 
-        data_opts->btree_insert_flags = 0;
-        return DATA_ADD_REPLICAS;
+                ret.total_work += atomic64_read(&ca->rebalance_work);
+        }
+
+        ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
+
+        return ret;
 }
 
-static int bch2_tiering_thread(void *arg)
+static void rebalance_work_reset(struct bch_fs *c)
 {
-        struct bch_tier *tier = arg;
-        struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
-        struct io_clock *clock = &c->io_clock[WRITE];
         struct bch_dev *ca;
-        struct bch_move_stats move_stats;
-        u64 tier_capacity, available_sectors;
-        unsigned long last;
-        unsigned i, nr_devices;
+        unsigned i;
+
+        for_each_online_member(ca, c, i)
+                atomic64_set(&ca->rebalance_work, 0);
+
+        atomic64_set(&c->rebalance_work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+        u64 utime, stime;
+
+        task_cputime_adjusted(current, &utime, &stime);
+        return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+        struct bch_fs *c = arg;
+        struct io_clock *clock = &c->io_clock[WRITE];
+        struct rebalance_work w, p;
+        unsigned long start, prev_start;
+        unsigned long prev_run_time, prev_run_cputime;
+        unsigned long cputime, prev_cputime;
 
-        memset(&move_stats, 0, sizeof(move_stats));
         set_freezable();
 
-        while (!kthread_should_stop()) {
-                if (kthread_wait_freezable(c->tiering_enabled &&
-                                           (nr_devices = dev_mask_nr(&tier->devs))))
-                        break;
-
-                while (1) {
-                        struct bch_tier *faster_tier;
-
-                        last = atomic_long_read(&clock->now);
-
-                        tier_capacity = available_sectors = 0;
-                        for (faster_tier = c->tiers;
-                             faster_tier != tier;
-                             faster_tier++) {
-                                rcu_read_lock();
-                                for_each_member_device_rcu(ca, c, i,
-                                                           &faster_tier->devs) {
-                                        tier_capacity +=
-                                                bucket_to_sector(ca,
-                                                        ca->mi.nbuckets -
-                                                        ca->mi.first_bucket);
-                                        available_sectors +=
-                                                bucket_to_sector(ca,
-                                                        dev_buckets_available(c, ca));
-                                }
-                                rcu_read_unlock();
-                        }
+        p = rebalance_work(c);
+        prev_start = jiffies;
+        prev_cputime = curr_cputime();
+
+        while (!kthread_wait_freezable(c->rebalance_enabled)) {
+                struct bch_move_stats move_stats = { 0 };
 
-                        if (available_sectors < (tier_capacity >> 1))
-                                break;
+                w = rebalance_work(c);
+                start = jiffies;
+                cputime = curr_cputime();
+
+                prev_run_time = start - prev_start;
+                prev_run_cputime = cputime - prev_cputime;
+
+                if (!w.total_work) {
+                        kthread_wait_freezable(rebalance_work(c).total_work);
+                        continue;
+                }
 
-                        bch2_kthread_io_clock_wait(clock,
-                                        last +
-                                        available_sectors -
-                                        (tier_capacity >> 1));
-                        if (kthread_should_stop())
-                                return 0;
+                if (w.dev_most_full_percent < 20 &&
+                    prev_run_cputime * 5 > prev_run_time) {
+                        if (w.dev_most_full_capacity) {
+                                bch2_kthread_io_clock_wait(clock,
+                                        atomic_long_read(&clock->now) +
+                                        div_u64(w.dev_most_full_capacity, 5));
+                        } else {
+
+                                set_current_state(TASK_INTERRUPTIBLE);
+                                if (kthread_should_stop())
+                                        break;
+
+                                schedule_timeout(prev_run_cputime * 5 -
+                                                 prev_run_time);
+                                continue;
+                        }
                 }
 
-                bch2_move_data(c, &tier->pd.rate,
-                               &tier->devs,
-                               writepoint_ptr(&tier->wp),
+                /* minimum 1 mb/sec: */
+                c->rebalance_pd.rate.rate =
+                        max_t(u64, 1 << 11,
+                              c->rebalance_pd.rate.rate *
+                              max(p.dev_most_full_percent, 1U) /
+                              max(w.dev_most_full_percent, 1U));
+
+                rebalance_work_reset(c);
+
+                bch2_move_data(c, &c->rebalance_pd.rate,
+                               writepoint_ptr(&c->rebalance_write_point),
                                POS_MIN, POS_MAX,
-                               tiering_pred, tier,
+                               rebalance_pred, NULL,
                                &move_stats);
         }
 
         return 0;
 }
 
-static void __bch2_tiering_stop(struct bch_tier *tier)
+void bch2_rebalance_stop(struct bch_fs *c)
 {
-        tier->pd.rate.rate = UINT_MAX;
-        bch2_ratelimit_reset(&tier->pd.rate);
-
-        if (tier->migrate)
-                kthread_stop(tier->migrate);
+        struct task_struct *p;
 
-        tier->migrate = NULL;
-}
+        c->rebalance_pd.rate.rate = UINT_MAX;
+        bch2_ratelimit_reset(&c->rebalance_pd.rate);
 
-void bch2_tiering_stop(struct bch_fs *c)
-{
-        struct bch_tier *tier;
+        p = c->rebalance_thread;
+        c->rebalance_thread = NULL;
 
-        for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
-                __bch2_tiering_stop(tier);
-}
+        if (p) {
+                /* for sychronizing with rebalance_wakeup() */
+                synchronize_rcu();
 
-static int __bch2_tiering_start(struct bch_tier *tier)
-{
-        if (!tier->migrate) {
-                struct task_struct *p =
-                        kthread_create(bch2_tiering_thread, tier,
-                                       "bch_tier[%u]", tier->idx);
-                if (IS_ERR(p))
-                        return PTR_ERR(p);
-
-                tier->migrate = p;
+                kthread_stop(p);
+                put_task_struct(p);
         }
-
-        wake_up_process(tier->migrate);
-        return 0;
 }
 
-int bch2_tiering_start(struct bch_fs *c)
+int bch2_rebalance_start(struct bch_fs *c)
 {
-        struct bch_tier *tier;
-        bool have_faster_tier = false;
+        struct task_struct *p;
 
         if (c->opts.nochanges)
                 return 0;
 
-        for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
-                if (!dev_mask_nr(&tier->devs))
-                        continue;
-
-                if (have_faster_tier) {
-                        int ret = __bch2_tiering_start(tier);
-                        if (ret)
-                                return ret;
-                } else {
-                        __bch2_tiering_stop(tier);
-                }
+        p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+        if (IS_ERR(p))
+                return PTR_ERR(p);
 
-                have_faster_tier = true;
-        }
+        get_task_struct(p);
+        rcu_assign_pointer(c->rebalance_thread, p);
+        wake_up_process(c->rebalance_thread);
 
         return 0;
 }
 
-void bch2_fs_tiering_init(struct bch_fs *c)
+void bch2_fs_rebalance_init(struct bch_fs *c)
 {
-        unsigned i;
+        bch2_pd_controller_init(&c->rebalance_pd);
 
-        for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
-                c->tiers[i].idx = i;
-                bch2_pd_controller_init(&c->tiers[i].pd);
-        }
+        atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
 }
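Note: the added code calls rebalance_wakeup(), which is not defined in this file; it presumably lives alongside the new declarations in the matching header. As a rough illustration only, not part of this commit, a wakeup helper consistent with the RCU protocol above (rcu_assign_pointer() in bch2_rebalance_start(), synchronize_rcu() before kthread_stop() in bch2_rebalance_stop()) might look like this:

/*
 * Hypothetical sketch, not from this diff: read the rebalance thread
 * pointer under rcu_read_lock(), so that bch2_rebalance_stop()'s
 * synchronize_rcu() can wait out any concurrent waker before it calls
 * kthread_stop() and put_task_struct().
 */
static inline void rebalance_wakeup(struct bch_fs *c)
{
        struct task_struct *p;

        rcu_read_lock();
        p = rcu_dereference(c->rebalance_thread);
        if (p)
                wake_up_process(p);
        rcu_read_unlock();
}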