author    Kent Overstreet <kent.overstreet@gmail.com>  2018-02-19 16:29:16 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2018-02-19 18:44:07 -0500
commit    2c997de0e437923ceb6b0d2f7c44c7106ccf5545 (patch)
tree      c5cd353a16d5caa542e47ce913b25a76f156d9a2
parent    02d7c0e4b51b52fccb937617b4b448ed278e17b6 (diff)
bcachefs: Convert tiering/promote code to disk groups
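
This replaces the old per-device tier number (BCH_MEMBER_TIER, struct bch_tier and its
per-tier migration threads) with disk-group based targets: the new foreground_target,
background_target and promote_target options, settable per filesystem and per inode,
name either a single device or a disk group, and a single rebalance thread does the
background data movement. As a rough illustration of the u16 target encoding added in
super-io.h below (TARGET_DEV_START and TARGET_GROUP_START are copied from that hunk;
the decode() helper and main() here are only a sketch for illustration, not part of
the patch):

#include <stdio.h>

/*
 * Sketch of the target encoding: 0 means no target, values in
 * [TARGET_DEV_START, TARGET_GROUP_START) name a single device index,
 * and values >= TARGET_GROUP_START name a disk group index.
 */
#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

static void decode(unsigned target)
{
	if (target >= TARGET_GROUP_START)
		printf("%u -> group %u\n", target, target - TARGET_GROUP_START);
	else if (target >= TARGET_DEV_START)
		printf("%u -> device %u\n", target, target - TARGET_DEV_START);
	else
		printf("%u -> no target\n", target);
}

int main(void)
{
	decode(0);	/* no target */
	decode(3);	/* device 2 */
	decode(257);	/* group 0 */
	return 0;
}
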
-rw-r--r--  fs/bcachefs/alloc.c                   | 168
-rw-r--r--  fs/bcachefs/alloc.h                   |   2
-rw-r--r--  fs/bcachefs/bcachefs.h                |  29
-rw-r--r--  fs/bcachefs/bcachefs_format.h         |  21
-rw-r--r--  fs/bcachefs/btree_update_interior.c   |   2
-rw-r--r--  fs/bcachefs/extents.c                 |  50
-rw-r--r--  fs/bcachefs/extents.h                 |   2
-rw-r--r--  fs/bcachefs/fs-io.c                   |  16
-rw-r--r--  fs/bcachefs/io.c                      |  46
-rw-r--r--  fs/bcachefs/io.h                      |  11
-rw-r--r--  fs/bcachefs/io_types.h                |   3
-rw-r--r--  fs/bcachefs/move.c                    |  29
-rw-r--r--  fs/bcachefs/move.h                    |   3
-rw-r--r--  fs/bcachefs/movinggc.c                |  11
-rw-r--r--  fs/bcachefs/opts.h                    |  14
-rw-r--r--  fs/bcachefs/super-io.c                |  67
-rw-r--r--  fs/bcachefs/super-io.h                |  28
-rw-r--r--  fs/bcachefs/super.c                   |  43
-rw-r--r--  fs/bcachefs/super_types.h             |   1
-rw-r--r--  fs/bcachefs/sysfs.c                   |  64
-rw-r--r--  fs/bcachefs/tier.c                    | 305
-rw-r--r--  fs/bcachefs/tier.h                    |  21
-rw-r--r--  fs/bcachefs/xattr.c                   |   6
-rw-r--r--  include/trace/events/bcachefs.h       |   4
24 files changed, 501 insertions(+), 445 deletions(-)
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 339ffd02c45f..a76f2b7cc48a 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -89,69 +89,29 @@ static void pd_controllers_update(struct work_struct *work)
struct bch_fs,
pd_controllers_update);
struct bch_dev *ca;
- unsigned i, iter;
-
- /* All units are in bytes */
- u64 faster_tiers_size = 0;
- u64 faster_tiers_dirty = 0;
-
- u64 copygc_can_free = 0;
-
- rcu_read_lock();
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
- bch2_pd_controller_update(&c->tiers[i].pd,
- div_u64(faster_tiers_size *
- c->tiering_percent, 100),
- faster_tiers_dirty,
- -1);
-
- for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
-
- u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket) << 9;
- u64 dirty = bucket_to_sector(ca,
- stats.buckets[BCH_DATA_USER]) << 9;
- u64 free = bucket_to_sector(ca,
- __dev_buckets_free(ca, stats)) << 9;
- /*
- * Bytes of internal fragmentation, which can be
- * reclaimed by copy GC
- */
- s64 fragmented = (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_USER] +
- stats.buckets[BCH_DATA_CACHED]) -
- (stats.sectors[BCH_DATA_USER] +
- stats.sectors[BCH_DATA_CACHED])) << 9;
+ unsigned i;
- fragmented = max(0LL, fragmented);
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
- bch2_pd_controller_update(&ca->copygc_pd,
- free, fragmented, -1);
+ u64 free = bucket_to_sector(ca,
+ __dev_buckets_free(ca, stats)) << 9;
+ /*
+ * Bytes of internal fragmentation, which can be
+ * reclaimed by copy GC
+ */
+ s64 fragmented = (bucket_to_sector(ca,
+ stats.buckets[BCH_DATA_USER] +
+ stats.buckets[BCH_DATA_CACHED]) -
+ (stats.sectors[BCH_DATA_USER] +
+ stats.sectors[BCH_DATA_CACHED])) << 9;
- faster_tiers_size += size;
- faster_tiers_dirty += dirty;
+ fragmented = max(0LL, fragmented);
- copygc_can_free += fragmented;
- }
+ bch2_pd_controller_update(&ca->copygc_pd,
+ free, fragmented, -1);
}
- rcu_read_unlock();
-
- /*
- * Throttle foreground writes if tier 0 is running out of free buckets,
- * and either tiering or copygc can free up space.
- *
- * Target will be small if there isn't any work to do - we don't want to
- * throttle foreground writes if we currently have all the free space
- * we're ever going to have.
- *
- * Otherwise, if there's work to do, try to keep 20% of tier0 available
- * for foreground writes.
- */
- if (c->fastest_tier)
- copygc_can_free = U64_MAX;
-
schedule_delayed_work(&c->pd_controllers_update,
c->pd_controllers_update_seconds * HZ);
}
@@ -1201,22 +1161,14 @@ out:
return ob - c->open_buckets;
}
-static int __dev_alloc_cmp(struct bch_fs *c,
- struct write_point *wp,
+static int __dev_alloc_cmp(struct write_point *wp,
unsigned l, unsigned r)
{
- struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
- struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
-
- if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
- return ((ca_l->mi.tier > ca_r->mi.tier) -
- (ca_l->mi.tier < ca_r->mi.tier));
-
return ((wp->next_alloc[l] > wp->next_alloc[r]) -
(wp->next_alloc[l] < wp->next_alloc[r]));
}
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
struct write_point *wp,
@@ -1355,7 +1307,7 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
static void writepoint_drop_ptrs(struct bch_fs *c,
struct write_point *wp,
- struct bch_devs_mask *devs,
+ u16 target, bool in_target,
unsigned nr_ptrs_dislike)
{
int i;
@@ -1367,7 +1319,8 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
+ if (nr_ptrs_dislike &&
+ dev_in_target(ca, target) == in_target) {
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
@@ -1401,7 +1354,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
}
static int open_bucket_add_buckets(struct bch_fs *c,
- struct bch_devs_mask *_devs,
+ u16 target,
struct write_point *wp,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
@@ -1422,8 +1375,15 @@ static int open_bucket_add_buckets(struct bch_fs *c,
writepoint_for_each_ptr(wp, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
- if (_devs)
- bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX);
+ if (target) {
+ const struct bch_devs_mask *t;
+
+ rcu_read_lock();
+ t = bch2_target_to_mask(c, target);
+ if (t)
+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+ rcu_read_unlock();
+ }
return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
}
@@ -1503,7 +1463,7 @@ out:
* Get us an open_bucket we can allocate from, return with it locked:
*/
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
- struct bch_devs_mask *devs,
+ unsigned target,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
@@ -1525,17 +1485,27 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
writepoint_for_each_ptr(wp, ob, i)
if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev))
nr_ptrs_have++;
- else if (devs && !test_bit(ob->ptr.dev, devs->d))
+ else if (!dev_in_target(c->devs[ob->ptr.dev], target))
nr_ptrs_dislike++;
- ret = open_bucket_add_buckets(c, devs, wp, devs_have,
+ ret = open_bucket_add_buckets(c, target, wp, devs_have,
nr_replicas + nr_ptrs_have + nr_ptrs_dislike,
reserve, cl);
if (ret && ret != -EROFS)
goto err;
- if (wp->nr_ptrs <
- nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) {
+ if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ goto alloc_done;
+
+ ret = open_bucket_add_buckets(c, target, wp, devs_have,
+ nr_replicas + nr_ptrs_have,
+ reserve, cl);
+ if (ret && ret != -EROFS)
+ goto err;
+alloc_done:
+ if (wp->nr_ptrs - nr_ptrs_have -
+ ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
+ < nr_replicas_required) {
ret = -EROFS;
goto err;
}
@@ -1545,7 +1515,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
0, nr_ptrs_dislike);
/* Remove pointers we don't want to use: */
- writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike);
+ writepoint_drop_ptrs(c, wp, target, false, nr_ptrs_dislike);
/*
* Move pointers to devices we already have to end of open bucket
@@ -1637,7 +1607,6 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
void bch2_recalc_capacity(struct bch_fs *c)
{
- struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
struct bch_dev *ca;
u64 total_capacity, capacity = 0, reserved_sectors = 0;
unsigned long ra_pages = 0;
@@ -1653,28 +1622,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- /* Find fastest, slowest tiers with devices: */
-
- for (tier = c->tiers;
- tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
- if (!dev_mask_nr(&tier->devs))
- continue;
- if (!fastest_tier)
- fastest_tier = tier;
- slowest_tier = tier;
- }
-
- c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
- c->fastest_devs = fastest_tier != slowest_tier ? &fastest_tier->devs : NULL;
-
- if (!fastest_tier)
- goto set_capacity;
-
- /*
- * Capacity of the filesystem is the capacity of all the devices in the
- * slowest (highest) tier - we don't include lower tier devices.
- */
- for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) {
+ for_each_rw_member(ca, c, i) {
size_t reserve = 0;
/*
@@ -1700,16 +1648,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserve += ARRAY_SIZE(c->write_points);
- if (ca->mi.tier)
- reserve += 1; /* tiering write point */
- reserve += 1; /* btree write point */
+ reserve += 1; /* btree write point */
reserved_sectors += bucket_to_sector(ca, reserve);
capacity += bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
}
-set_capacity:
+
total_capacity = capacity;
capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1745,7 +1691,8 @@ static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
mutex_lock(&wp->lock);
- writepoint_drop_ptrs(c, wp, &not_self, wp->nr_ptrs);
+ writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx),
+ true, wp->nr_ptrs);
mutex_unlock(&wp->lock);
}
@@ -1776,7 +1723,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* First, remove device from allocation groups: */
- clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
@@ -1790,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
bch2_stop_write_point(c, ca, &c->write_points[i]);
bch2_stop_write_point(c, ca, &ca->copygc_write_point);
- bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
+ bch2_stop_write_point(c, ca, &c->rebalance_write_point);
bch2_stop_write_point(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
@@ -1828,7 +1774,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
- set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
}
/* stop allocator thread: */
@@ -2059,7 +2004,6 @@ void bch2_fs_allocator_init(struct bch_fs *c)
{
struct open_bucket *ob;
struct write_point *wp;
- unsigned i;
mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
@@ -2079,9 +2023,7 @@ void bch2_fs_allocator_init(struct bch_fs *c)
}
writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
-
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
- writepoint_init(&c->tiers[i].wp, BCH_DATA_USER);
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
for (wp = c->write_points;
wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h
index 3bdc294691de..5b58922344a8 100644
--- a/fs/bcachefs/alloc.h
+++ b/fs/bcachefs/alloc.h
@@ -66,7 +66,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
- struct bch_devs_mask *,
+ unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index e50aa3dd49bc..75f3a0061279 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -408,6 +408,8 @@ struct bch_dev {
struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
+ atomic64_t rebalance_work;
+
struct journal_device journal;
struct work_struct io_error_work;
@@ -458,15 +460,6 @@ struct btree_debug {
struct dentry *failed;
};
-struct bch_tier {
- unsigned idx;
- struct task_struct *migrate;
- struct bch_pd_controller pd;
-
- struct bch_devs_mask devs;
- struct write_point wp;
-};
-
enum bch_fs_state {
BCH_FS_STARTING = 0,
BCH_FS_STOPPING,
@@ -570,16 +563,13 @@ struct bch_fs {
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
+ /* REBALANCE */
+ struct task_struct *rebalance_thread;
+ struct bch_pd_controller rebalance_pd;
+
+ atomic64_t rebalance_work_unknown_dev;
- /*
- * These contain all r/w devices - i.e. devices we can currently
- * allocate from:
- */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
- struct bch_tier tiers[BCH_TIER_MAX];
- /* NULL if we only have devices in one tier: */
- struct bch_devs_mask *fastest_devs;
- struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
@@ -616,6 +606,7 @@ struct bch_fs {
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
+ struct write_point rebalance_write_point;
struct write_point write_points[WRITE_POINT_COUNT];
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
@@ -718,8 +709,8 @@ struct bch_fs {
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
- unsigned tiering_enabled:1;
- unsigned tiering_percent;
+ unsigned rebalance_enabled:1;
+ unsigned rebalance_percent;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f70977dbf4b9..0f2c9cecda72 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -610,14 +610,20 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_compression, 8) \
BCH_INODE_FIELD(bi_project, 32) \
BCH_INODE_FIELD(bi_background_compression, 8) \
- BCH_INODE_FIELD(bi_data_replicas, 8)
+ BCH_INODE_FIELD(bi_data_replicas, 8) \
+ BCH_INODE_FIELD(bi_promote_target, 16) \
+ BCH_INODE_FIELD(bi_foreground_target, 16) \
+ BCH_INODE_FIELD(bi_background_target, 16)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
BCH_INODE_FIELD(bi_compression) \
BCH_INODE_FIELD(bi_project) \
BCH_INODE_FIELD(bi_background_compression) \
- BCH_INODE_FIELD(bi_data_replicas)
+ BCH_INODE_FIELD(bi_data_replicas) \
+ BCH_INODE_FIELD(bi_promote_target) \
+ BCH_INODE_FIELD(bi_foreground_target) \
+ BCH_INODE_FIELD(bi_background_target)
enum {
/*
@@ -818,13 +824,14 @@ struct bch_member {
};
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
-LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8)
-/* 8-10 unused, was HAS_(META)DATA */
+/* 4-10 unused, was TIER, HAS_(META)DATA */
LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
+#define BCH_TIER_MAX 4U
+
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
@@ -838,8 +845,6 @@ enum bch_member_state {
BCH_MEMBER_STATE_NR = 4,
};
-#define BCH_TIER_MAX 4U
-
enum cache_replacement {
CACHE_REPLACEMENT_LRU = 0,
CACHE_REPLACEMENT_FIFO = 1,
@@ -1084,6 +1089,10 @@ LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[1], 28, 32);
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
+
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 0e0156d9016c..f42239dab71c 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -348,7 +348,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c, NULL,
+ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index ce1f8ba23035..37470f86e588 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1766,7 +1766,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
unsigned seq, stale;
char buf[160];
bool bad;
- unsigned ptrs_per_tier[BCH_TIER_MAX];
unsigned replicas = 0;
/*
@@ -1778,12 +1777,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
* going to get overwritten during replay)
*/
- memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
-
extent_for_each_ptr(e, ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
replicas++;
- ptrs_per_tier[ca->mi.tier]++;
/*
* If journal replay hasn't finished, we might be seeing keys
@@ -1886,12 +1882,6 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
#undef p
}
-static unsigned PTR_TIER(struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
-{
- return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
-}
-
static void bch2_extent_crc_init(union bch_extent_crc *crc,
struct bch_extent_crc_unpacked new)
{
@@ -2014,45 +2004,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e,
- unsigned nr_desired_replicas)
+ unsigned nr_desired_replicas,
+ unsigned target)
{
struct bch_extent_ptr *ptr;
- unsigned tier = 0, nr_cached = 0;
- unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
- bool have_higher_tier;
+ unsigned nr_cached = 0, nr_good = bch2_extent_nr_good_ptrs(c, e.c);
if (nr_good <= nr_desired_replicas)
return;
nr_cached = nr_good - nr_desired_replicas;
- do {
- have_higher_tier = false;
-
- extent_for_each_ptr(e, ptr) {
- if (!ptr->cached &&
- PTR_TIER(c, ptr) == tier) {
- ptr->cached = true;
- nr_cached--;
- if (!nr_cached)
- return;
- }
-
- if (PTR_TIER(c, ptr) > tier)
- have_higher_tier = true;
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached &&
+ !dev_in_target(c->devs[ptr->dev], target)) {
+ ptr->cached = true;
+ nr_cached--;
+ if (!nr_cached)
+ return;
}
-
- tier++;
- } while (have_higher_tier);
}
/*
- * This picks a non-stale pointer, preferabbly from a device other than
- * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale
- * pointers to other devices, it will still pick a pointer from avoid.
- * Note that it prefers lowered-numbered pointers to higher-numbered pointers
- * as the pointers are sorted by tier, hence preferring pointers to tier 0
- * rather than pointers to tier 1.
+ * This picks a non-stale pointer, preferably from a device other than @avoid.
+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
+ * other devices, it will still pick a pointer from avoid.
*/
void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct bch_devs_mask *avoid,
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 75579273fae8..83c0f24db588 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -39,7 +39,7 @@ bch2_insert_fixup_extent(struct btree_insert *,
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
- unsigned);
+ unsigned, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index d7fb28cc95c7..46cffc5c9b7a 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -504,10 +504,8 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
op->unalloc = false;
op->new_i_size = U64_MAX;
- bch2_write_op_init(&op->op, c);
- op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
- op->op.devs = c->fastest_devs;
+ bch2_write_op_init(&op->op, c, opts);
+ op->op.target = opts.foreground_target;
op->op.index_update_fn = bchfs_write_index_update;
op_journal_seq_set(&op->op, &inode->ei_journal_seq);
}
@@ -615,8 +613,14 @@ static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *in
struct page *page, bool check_enospc)
{
struct bch_page_state *s = page_state(page), new, old;
+
+ /* XXX: this should not be open coded */
+ unsigned nr_replicas = inode->ei_inode.bi_data_replicas
+ ? inode->ei_inode.bi_data_replicas - 1
+ : c->opts.data_replicas;
+
struct disk_reservation disk_res = bch2_disk_reservation_init(c,
- READ_ONCE(c->opts.data_replicas));
+ nr_replicas);
struct quota_res quota_res = { 0 };
int ret = 0;
@@ -1894,7 +1898,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
goto err;
ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
- c->opts.data_replicas, 0);
+ dio->iop.op.opts.data_replicas, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 13495d487a68..6624d8af574f 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -22,6 +22,7 @@
#include "move.h"
#include "super.h"
#include "super-io.h"
+#include "tier.h"
#include <linux/blkdev.h>
#include <linux/random.h>
@@ -220,9 +221,9 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_INTENT);
ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
- NULL, op_journal_seq(op),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ NULL, op_journal_seq(op),
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE);
bch2_btree_iter_unlock(&iter);
return ret;
@@ -238,7 +239,7 @@ static void bch2_write_index(struct closure *cl)
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bkey_i *src, *dst = keys->keys, *n;
+ struct bkey_i *src, *dst = keys->keys, *n, *k;
int ret;
op->flags |= BCH_WRITE_LOOPED;
@@ -268,6 +269,14 @@ static void bch2_write_index(struct closure *cl)
keys->top = dst;
+ /*
+ * probably not the ideal place to hook this in, but I don't
+ * particularly want to plumb io_opts all the way through the btree
+ * update stack right now
+ */
+ for_each_keylist_key(keys, k)
+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@@ -735,7 +744,7 @@ static void __bch2_write(struct closure *cl)
continue_at(cl, bch2_write_index, index_update_wq(op));
wp = bch2_alloc_sectors_start(c,
- op->devs,
+ op->target,
op->write_point,
&op->devs_have,
op->nr_replicas,
@@ -935,29 +944,32 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
- ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
- writepoint_hashed((unsigned long) current),
- rbio->opts,
- DATA_PROMOTE,
- (struct data_opts) { 0 },
- k);
+ ret = bch2_migrate_write_init(c, &op->write,
+ writepoint_hashed((unsigned long) current),
+ rbio->opts,
+ DATA_PROMOTE,
+ (struct data_opts) {
+ .target = rbio->opts.promote_target
+ },
+ k);
BUG_ON(ret);
return op;
}
-/* only promote if we're not reading from the fastest tier: */
-static bool should_promote(struct bch_fs *c,
- struct extent_pick_ptr *pick, unsigned flags)
+static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
+ unsigned flags, u16 target)
{
+ if (!target)
+ return false;
+
if (!(flags & BCH_READ_MAY_PROMOTE))
return false;
if (percpu_ref_is_dying(&c->writes))
return false;
- return c->fastest_tier &&
- c->fastest_tier < c->tiers + pick->ca->mi.tier;
+ return bch2_extent_has_target(c, e, target);
}
/* Read */
@@ -1323,7 +1335,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bounce = true;
}
- promote = should_promote(c, pick, flags);
+ promote = should_promote(c, e, flags, orig->opts.promote_target);
/* could also set read_full */
if (promote)
bounce = true;
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 4208fd4385bf..bf0b17e1deb9 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -61,24 +61,25 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *);
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+ struct bch_io_opts opts)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
- op->compression_type =
- bch2_compression_opt_to_type[c->opts.compression];
+ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ op->compression_type = bch2_compression_opt_to_type[opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
op->open_buckets_nr = 0;
op->devs_have.nr = 0;
+ op->target = 0;
+ op->opts = opts;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
- op->devs = NULL;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 32ecac242288..a022ab335428 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -103,13 +103,14 @@ struct bch_write_op {
u16 target;
u16 nonce;
+ struct bch_io_opts opts;
+
struct bpos pos;
struct bversion version;
/* For BCH_WRITE_DATA_ENCODED: */
struct bch_extent_crc_unpacked crc;
- struct bch_devs_mask *devs;
struct write_point_specifier write_point;
struct disk_reservation res;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index cee63dbf3ca5..a7c4c3ac1da5 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -125,7 +125,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
- c->opts.data_replicas);
+ op->opts.background_target,
+ op->opts.data_replicas);
/*
* It's possible we race, and for whatever reason the extent now
@@ -215,7 +216,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
}
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
- struct bch_devs_mask *devs,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
enum data_cmd data_cmd,
@@ -228,12 +228,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
m->data_opts = data_opts;
m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
- bch2_write_op_init(&m->op, c);
- m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+ bch2_write_op_init(&m->op, c, io_opts);
m->op.compression_type =
bch2_compression_opt_to_type[io_opts.background_compression ?:
io_opts.compression];
- m->op.devs = devs;
+ m->op.target = data_opts.target,
m->op.write_point = wp;
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
@@ -251,8 +250,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
switch (data_cmd) {
case DATA_ADD_REPLICAS:
- if (m->nr_ptrs_reserved < c->opts.data_replicas) {
- m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+ if (m->nr_ptrs_reserved < io_opts.data_replicas) {
+ m->op.nr_replicas = io_opts.data_replicas - m->nr_ptrs_reserved;
ret = bch2_disk_reservation_get(c, &m->op.res,
k.k->size,
@@ -260,7 +259,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
if (ret)
return ret;
- m->nr_ptrs_reserved = c->opts.data_replicas;
+ m->nr_ptrs_reserved = io_opts.data_replicas;
}
break;
case DATA_REWRITE:
@@ -370,7 +369,6 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
static int bch2_move_extent(struct bch_fs *c,
struct moving_context *ctxt,
- struct bch_devs_mask *devs,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
struct bkey_s_c_extent e,
@@ -429,8 +427,8 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
io->rbio.bio.bi_end_io = move_read_endio;
- ret = bch2_migrate_write_init(c, &io->write, devs, wp,
- io_opts, data_cmd, data_opts, e.s_c);
+ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
+ data_cmd, data_opts, e.s_c);
if (ret)
goto err_free_pages;
@@ -461,7 +459,6 @@ err:
int bch2_move_data(struct bch_fs *c,
struct bch_ratelimit *rate,
- struct bch_devs_mask *devs,
struct write_point_specifier wp,
struct bpos start,
struct bpos end,
@@ -543,7 +540,7 @@ peek:
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&stats->iter);
- ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+ ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
bkey_s_c_to_extent(k),
data_cmd, data_opts);
if (ret2) {
@@ -694,11 +691,12 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
unsigned replicas = type == BKEY_TYPE_BTREE
? c->opts.metadata_replicas
- : c->opts.data_replicas;
+ : io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
return DATA_SKIP;
+ data_opts->target = 0;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
@@ -714,6 +712,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
if (!bch2_extent_has_device(e, op->migrate.dev))
return DATA_SKIP;
+ data_opts->target = 0;
data_opts->btree_insert_flags = 0;
data_opts->rewrite_dev = op->migrate.dev;
return DATA_REWRITE;
@@ -734,7 +733,6 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,
- NULL,
writepoint_hashed((unsigned long) current),
op.start,
op.end,
@@ -752,7 +750,6 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,
- NULL,
writepoint_hashed((unsigned long) current),
op.start,
op.end,
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index cbcc44db9978..bc98f94bb23d 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -17,6 +17,7 @@ enum data_cmd {
};
struct data_opts {
+ u16 target;
unsigned rewrite_dev;
int btree_insert_flags;
};
@@ -38,7 +39,6 @@ struct migrate_write {
void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
- struct bch_devs_mask *,
struct write_point_specifier,
struct bch_io_opts,
enum data_cmd, struct data_opts,
@@ -59,7 +59,6 @@ struct bch_move_stats {
};
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
- struct bch_devs_mask *,
struct write_point_specifier,
struct bpos, struct bpos,
move_pred_fn, void *,
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index f6810c3ed52e..ad56e039163b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -21,6 +21,7 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
+#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/wait.h>
@@ -94,7 +95,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
if (!__copygc_pred(ca, e))
return DATA_SKIP;
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE,
+ data_opts->target = dev_to_target(ca->dev_idx);
+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = ca->dev_idx;
return DATA_REWRITE;
}
@@ -178,7 +180,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
bucket_offset_cmp, NULL);
ret = bch2_move_data(c, &ca->copygc_pd.rate,
- &ca->self,
writepoint_ptr(&ca->copygc_write_point),
POS_MIN, POS_MAX,
copygc_pred, ca,
@@ -247,8 +248,10 @@ void bch2_copygc_stop(struct bch_dev *ca)
ca->copygc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->copygc_pd.rate);
- if (ca->copygc_thread)
+ if (ca->copygc_thread) {
kthread_stop(ca->copygc_thread);
+ put_task_struct(ca->copygc_thread);
+ }
ca->copygc_thread = NULL;
}
@@ -268,6 +271,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (IS_ERR(t))
return PTR_ERR(t);
+ get_task_struct(t);
+
ca->copygc_thread = t;
wake_up_process(ca->copygc_thread);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index a0bc2ca6dba0..ec4e36ab8dd8 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -100,6 +100,15 @@ enum opt_type {
BCH_OPT(str_hash, u8, OPT_RUNTIME, \
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \
+ BCH_OPT(foreground_target, u16, OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_FOREGROUND_TARGET, 0) \
+ BCH_OPT(background_target, u16, OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_BACKGROUND_TARGET, 0) \
+ BCH_OPT(promote_target, u16, OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_PROMOTE_TARGET, 0) \
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, false) \
@@ -243,7 +252,10 @@ int bch2_parse_mount_opts(struct bch_opts *, char *);
BCH_INODE_OPT(data_checksum, 8) \
BCH_INODE_OPT(compression, 8) \
BCH_INODE_OPT(background_compression, 8) \
- BCH_INODE_OPT(data_replicas, 8)
+ BCH_INODE_OPT(data_replicas, 8) \
+ BCH_INODE_OPT(promote_target, 16) \
+ BCH_INODE_OPT(foreground_target, 16) \
+ BCH_INODE_OPT(background_target, 16)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 1c4f67cc4bd4..ba32e69d2a98 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1601,24 +1601,22 @@ static const char *bch2_sb_validate_quota(struct bch_sb *sb,
/* Disk groups: */
-#if 0
-static size_t trim_nulls(const char *str, size_t len)
+static int strcmp_void(const void *l, const void *r)
{
- while (len && !str[len - 1])
- --len;
- return len;
+ return strcmp(l, r);
}
-#endif
static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
+ struct bch_disk_group *g;
struct bch_sb_field_members *mi;
struct bch_member *m;
- struct bch_disk_group *g;
- unsigned nr_groups;
+ unsigned i, nr_groups, nr_live = 0, len;
+ char **labels, *l;
+ const char *err = NULL;
mi = bch2_sb_get_members(sb);
groups = bch2_sb_get_disk_groups(sb);
@@ -1627,32 +1625,57 @@ static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
for (m = mi->members;
m < mi->members + sb->nr_devices;
m++) {
+ unsigned g;
+
if (!BCH_MEMBER_GROUP(m))
continue;
- if (BCH_MEMBER_GROUP(m) >= nr_groups)
- return "disk has invalid group";
+ g = BCH_MEMBER_GROUP(m) - 1;
- g = &groups->entries[BCH_MEMBER_GROUP(m)];
- if (BCH_GROUP_DELETED(g))
+ if (g >= nr_groups ||
+ BCH_GROUP_DELETED(&groups->entries[g]))
return "disk has invalid group";
}
-#if 0
- if (!groups)
+
+ if (!nr_groups)
return NULL;
- char **labels;
labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
if (!labels)
return "cannot allocate memory";
- for (g = groups->groups;
- g < groups->groups + nr_groups;
+ for (g = groups->entries;
+ g < groups->entries + nr_groups;
g++) {
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+ len = strnlen(g->label, sizeof(g->label));
+
+ labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL);
+ if (!l) {
+ err = "cannot allocate memory";
+ goto err;
+ }
+ memcpy(l, g->label, len);
+ l[len] = '\0';
}
-#endif
- return NULL;
+
+ sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL);
+
+ for (i = 0; i + 1 < nr_live; i++)
+ if (!strcmp(labels[i], labels[i + 1])) {
+ err = "duplicate group labels";
+ goto err;
+ }
+
+ err = NULL;
+err:
+ for (i = 0; i < nr_live; i++)
+ kfree(labels[i]);
+ kfree(labels);
+ return err;
}
static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
@@ -1693,7 +1716,11 @@ static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
if (!bch2_member_exists(m))
continue;
- __set_bit(i, dst->devs.d);
+ dst = BCH_MEMBER_GROUP(m)
+ ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1]
+ : NULL;
+ if (dst)
+ __set_bit(i, dst->devs.d);
}
old_g = c->disk_groups;
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index d7fecf02f81c..cd8bfe6cfea8 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -129,7 +129,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
.state = BCH_MEMBER_STATE(mi),
- .tier = BCH_MEMBER_TIER(mi),
.replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
@@ -204,27 +203,34 @@ struct target {
};
};
+#define TARGET_DEV_START 1
+#define TARGET_GROUP_START (256 + TARGET_DEV_START)
+
static inline u16 dev_to_target(unsigned dev)
{
- return 1 + dev;
+ return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
- return 1 + U8_MAX + group;
+ return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
- if (!target)
- return (struct target) { .type = TARGET_NULL };
+ if (target >= TARGET_GROUP_START)
+ return (struct target) {
+ .type = TARGET_GROUP,
+ .group = target - TARGET_GROUP_START
+ };
- --target;
- if (target <= U8_MAX)
- return (struct target) { .type = TARGET_DEV, .dev = target };
+ if (target >= TARGET_DEV_START)
+ return (struct target) {
+ .type = TARGET_DEV,
+ .group = target - TARGET_DEV_START
+ };
- target -= U8_MAX;
- return (struct target) { .type = TARGET_GROUP, .group = target };
+ return (struct target) { .type = TARGET_NULL };
}
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
@@ -232,6 +238,8 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
struct target t = target_decode(target);
switch (t.type) {
+ case TARGET_NULL:
+ return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 777d60d3a4de..287535e9c4f7 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -149,6 +149,7 @@ int bch2_congested(void *data, int bdi_bits)
unsigned i;
int ret = 0;
+ rcu_read_lock();
if (bdi_bits & (1 << WB_sync_congested)) {
/* Reads - check all devices: */
for_each_readable_member(ca, c, i) {
@@ -160,12 +161,11 @@ int bch2_congested(void *data, int bdi_bits)
}
}
} else {
- /* Writes prefer fastest tier: */
- struct bch_tier *tier = READ_ONCE(c->fastest_tier);
- struct bch_devs_mask *devs =
- tier ? &tier->devs : &c->rw_devs[BCH_DATA_USER];
+ unsigned target = READ_ONCE(c->opts.foreground_target);
+ const struct bch_devs_mask *devs = target
+ ? bch2_target_to_mask(c, target)
+ : &c->rw_devs[BCH_DATA_USER];
- rcu_read_lock();
for_each_member_device_rcu(ca, c, i, devs) {
bdi = ca->disk_sb.bdev->bd_bdi;
@@ -174,8 +174,8 @@ int bch2_congested(void *data, int bdi_bits)
break;
}
}
- rcu_read_unlock();
}
+ rcu_read_unlock();
return ret;
}
@@ -185,9 +185,9 @@ int bch2_congested(void *data, int bdi_bits)
/*
* For startup/shutdown of RW stuff, the dependencies are:
*
- * - foreground writes depend on copygc and tiering (to free up space)
+ * - foreground writes depend on copygc and rebalance (to free up space)
*
- * - copygc and tiering depend on mark and sweep gc (they actually probably
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
* don't because they either reserve ahead of time or don't block if
* allocations fail, but allocations can require mark and sweep gc to run
* because of generation number wraparound)
@@ -225,7 +225,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
- bch2_tiering_stop(c);
+ bch2_rebalance_stop(c);
for_each_member_device(ca, c, i)
bch2_copygc_stop(ca);
@@ -385,8 +385,8 @@ const char *bch2_fs_read_write(struct bch_fs *c)
goto err;
}
- err = "error starting tiering thread";
- if (bch2_tiering_start(c))
+ err = "error starting rebalance thread";
+ if (bch2_rebalance_start(c))
goto err;
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
@@ -531,7 +531,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
#undef BCH_TIME_STAT
bch2_fs_allocator_init(c);
- bch2_fs_tiering_init(c);
+ bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
INIT_LIST_HEAD(&c->list);
@@ -555,8 +555,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
c->copy_gc_enabled = 1;
- c->tiering_enabled = 1;
- c->tiering_percent = 10;
+ c->rebalance_enabled = 1;
+ c->rebalance_percent = 10;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
@@ -1215,6 +1215,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
+ rebalance_wakeup(c);
+
percpu_ref_reinit(&ca->io_ref);
return 0;
}
@@ -1339,9 +1341,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
if (bch2_copygc_start(c, ca))
return "error starting copygc thread";
- if (bch2_tiering_start(c))
- return "error starting tiering thread";
-
return NULL;
}
@@ -1349,6 +1348,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_sb_field_members *mi;
+ int ret = 0;
if (ca->mi.state == new_state)
return 0;
@@ -1367,10 +1367,13 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (new_state == BCH_MEMBER_STATE_RW)
- return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+ if (new_state == BCH_MEMBER_STATE_RW &&
+ __bch2_dev_read_write(c, ca))
+ ret = -ENOMEM;
- return 0;
+ rebalance_wakeup(c);
+
+ return ret;
}
int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index d76d917cb039..3be05e9b0888 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -24,7 +24,6 @@ struct bch_member_cpu {
u16 bucket_size; /* sectors */
u16 group;
u8 state;
- u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 722aa2642b92..34c5f9029bd6 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -172,11 +172,9 @@ rw_attribute(cache_replacement_policy);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
-rw_attribute(tier);
-rw_attribute(tiering_enabled);
-rw_attribute(tiering_percent);
-sysfs_pd_controller_attribute(tiering);
-
+rw_attribute(rebalance_enabled);
+rw_attribute(rebalance_percent);
+sysfs_pd_controller_attribute(rebalance);
rw_attribute(pd_controllers_update_seconds);
@@ -332,10 +330,10 @@ SHOW(bch2_fs)
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
- sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
- sysfs_print(tiering_percent, c->tiering_percent);
+ sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
+ sysfs_print(rebalance_percent, c->rebalance_percent);
- sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
+ sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
@@ -397,19 +395,19 @@ STORE(__bch2_fs)
return ret;
}
- if (attr == &sysfs_tiering_enabled) {
- ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
+ if (attr == &sysfs_rebalance_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
?: (ssize_t) size;
- bch2_tiering_start(c); /* issue wakeups */
+ rebalance_wakeup(c);
return ret;
}
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
- sysfs_strtoul(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
+ sysfs_strtoul(rebalance_percent, c->rebalance_percent);
+ sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
/* Debugging: */
@@ -468,7 +466,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_writeback_pages_max,
- &sysfs_tiering_percent,
+ &sysfs_rebalance_percent,
&sysfs_compression_stats,
NULL
@@ -506,8 +504,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,
- &sysfs_tiering_enabled,
- sysfs_pd_controller_files(tiering),
+ &sysfs_rebalance_enabled,
+ sysfs_pd_controller_files(rebalance),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -564,6 +562,12 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_by_id(&c->opts, id, v);
+ if ((id == Opt_background_target ||
+ id == Opt_background_compression) && v) {
+ bch2_rebalance_add_work(c, S64_MAX);
+ rebalance_wakeup(c);
+ }
+
return size;
}
SYSFS_OPS(bch2_fs_opts_dir);
@@ -826,8 +830,6 @@ SHOW(bch2_dev)
return out - buf;
}
- sysfs_print(tier, ca->mi.tier);
-
if (attr == &sysfs_state_rw) {
out += bch2_scnprint_string_list(out, end - out,
bch2_dev_state,
@@ -891,31 +893,6 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
- if (attr == &sysfs_tier) {
- unsigned prev_tier;
- unsigned v = strtoul_restrict_or_return(buf,
- 0, BCH_TIER_MAX - 1);
-
- mutex_lock(&c->sb_lock);
- prev_tier = ca->mi.tier;
-
- if (v == ca->mi.tier) {
- mutex_unlock(&c->sb_lock);
- return size;
- }
-
- mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
- SET_BCH_MEMBER_TIER(mi, v);
- bch2_write_super(c);
-
- clear_bit(ca->dev_idx, c->tiers[prev_tier].devs.d);
- set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
- mutex_unlock(&c->sb_lock);
-
- bch2_recalc_capacity(c);
- bch2_tiering_start(c);
- }
-
if (attr == &sysfs_wake_allocator)
bch2_wake_allocator(ca);
@@ -933,7 +910,6 @@ struct attribute *bch2_dev_files[] = {
/* settings: */
&sysfs_discard,
&sysfs_cache_replacement_policy,
- &sysfs_tier,
&sysfs_state_rw,
&sysfs_has_data,
diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c
index e992ab44ed3a..211a844c69cf 100644
--- a/fs/bcachefs/tier.c
+++ b/fs/bcachefs/tier.c
@@ -12,172 +12,247 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
-static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
- struct bkey_s_c_extent e)
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ struct bch_extent_crc_unpacked crc,
+ struct bch_io_opts *io_opts)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (io_opts->background_target &&
+ !dev_in_target(ca, io_opts->background_target) &&
+ !ptr->cached)
+ return true;
+
+ if (io_opts->background_compression &&
+ crc.compression_type !=
+ bch2_compression_opt_to_type[io_opts->background_compression])
+ return true;
+
+ return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ struct bkey_s_c_extent e;
+
+ if (!bkey_extent_is_data(k.k))
+ return;
+
+ if (!io_opts->background_target &&
+ !io_opts->background_compression)
+ return;
+
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (!atomic64_add_return(crc.compressed_size,
+ &ca->rebalance_work))
+ rebalance_wakeup(c);
+ }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+ if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
+ rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
- unsigned replicas = 0;
+ struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
- return false;
+ return DATA_SKIP;
- extent_for_each_ptr(e, ptr)
- if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
- replicas++;
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+ goto found;
- return replicas < c->opts.data_replicas;
+ return DATA_SKIP;
+found:
+ data_opts->target = io_opts->background_target;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
}
-static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+struct rebalance_work {
+ unsigned dev_most_full_percent;
+ u64 dev_most_full_work;
+ u64 dev_most_full_capacity;
+ u64 total_work;
+};
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
{
- struct bch_tier *tier = arg;
+ struct bch_dev *ca;
+ struct rebalance_work ret = { 0 };
+ unsigned i;
- if (!__tiering_pred(c, tier, e))
- return DATA_SKIP;
+ for_each_online_member(ca, c, i) {
+ u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
+ ca->mi.first_bucket);
+ u64 work = atomic64_read(&ca->rebalance_work) +
+ atomic64_read(&c->rebalance_work_unknown_dev);
+ unsigned percent_full = div_u64(work * 100, capacity);
+
+ if (percent_full > ret.dev_most_full_percent) {
+ ret.dev_most_full_percent = percent_full;
+ ret.dev_most_full_work = work;
+ ret.dev_most_full_capacity = capacity;
+ }
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ ret.total_work += atomic64_read(&ca->rebalance_work);
+ }
+
+ ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
+
+ return ret;
}
-static int bch2_tiering_thread(void *arg)
+static void rebalance_work_reset(struct bch_fs *c)
{
- struct bch_tier *tier = arg;
- struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
- struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
- struct bch_move_stats move_stats;
- u64 tier_capacity, available_sectors;
- unsigned long last;
- unsigned i, nr_devices;
+ unsigned i;
+
+ for_each_online_member(ca, c, i)
+ atomic64_set(&ca->rebalance_work, 0);
+
+ atomic64_set(&c->rebalance_work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+ return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct rebalance_work w, p;
+ unsigned long start, prev_start;
+ unsigned long prev_run_time, prev_run_cputime;
+ unsigned long cputime, prev_cputime;
- memset(&move_stats, 0, sizeof(move_stats));
set_freezable();
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->tiering_enabled &&
- (nr_devices = dev_mask_nr(&tier->devs))))
- break;
-
- while (1) {
- struct bch_tier *faster_tier;
-
- last = atomic_long_read(&clock->now);
-
- tier_capacity = available_sectors = 0;
- for (faster_tier = c->tiers;
- faster_tier != tier;
- faster_tier++) {
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &faster_tier->devs) {
- tier_capacity +=
- bucket_to_sector(ca,
- ca->mi.nbuckets -
- ca->mi.first_bucket);
- available_sectors +=
- bucket_to_sector(ca,
- dev_buckets_available(c, ca));
- }
- rcu_read_unlock();
- }
+ p = rebalance_work(c);
+ prev_start = jiffies;
+ prev_cputime = curr_cputime();
+
+ while (!kthread_wait_freezable(c->rebalance_enabled)) {
+ struct bch_move_stats move_stats = { 0 };
- if (available_sectors < (tier_capacity >> 1))
- break;
+ w = rebalance_work(c);
+ start = jiffies;
+ cputime = curr_cputime();
+
+ prev_run_time = start - prev_start;
+ prev_run_cputime = cputime - prev_cputime;
+
+ if (!w.total_work) {
+ kthread_wait_freezable(rebalance_work(c).total_work);
+ continue;
+ }
- bch2_kthread_io_clock_wait(clock,
- last +
- available_sectors -
- (tier_capacity >> 1));
- if (kthread_should_stop())
- return 0;
+ if (w.dev_most_full_percent < 20 &&
+ prev_run_cputime * 5 > prev_run_time) {
+ if (w.dev_most_full_capacity) {
+ bch2_kthread_io_clock_wait(clock,
+ atomic_long_read(&clock->now) +
+ div_u64(w.dev_most_full_capacity, 5));
+ } else {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+
+ schedule_timeout(prev_run_cputime * 5 -
+ prev_run_time);
+ continue;
+ }
}
- bch2_move_data(c, &tier->pd.rate,
- &tier->devs,
- writepoint_ptr(&tier->wp),
+ /* minimum 1 mb/sec: */
+ c->rebalance_pd.rate.rate =
+ max_t(u64, 1 << 11,
+ c->rebalance_pd.rate.rate *
+ max(p.dev_most_full_percent, 1U) /
+ max(w.dev_most_full_percent, 1U));
+
+ rebalance_work_reset(c);
+
+ bch2_move_data(c, &c->rebalance_pd.rate,
+ writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
- tiering_pred, tier,
+ rebalance_pred, NULL,
&move_stats);
}
return 0;
}
-static void __bch2_tiering_stop(struct bch_tier *tier)
+void bch2_rebalance_stop(struct bch_fs *c)
{
- tier->pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&tier->pd.rate);
-
- if (tier->migrate)
- kthread_stop(tier->migrate);
+ struct task_struct *p;
- tier->migrate = NULL;
-}
+ c->rebalance_pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&c->rebalance_pd.rate);
-void bch2_tiering_stop(struct bch_fs *c)
-{
- struct bch_tier *tier;
+ p = c->rebalance_thread;
+ c->rebalance_thread = NULL;
- for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
- __bch2_tiering_stop(tier);
-}
+ if (p) {
+ /* for sychronizing with rebalance_wakeup() */
+ synchronize_rcu();
-static int __bch2_tiering_start(struct bch_tier *tier)
-{
- if (!tier->migrate) {
- struct task_struct *p =
- kthread_create(bch2_tiering_thread, tier,
- "bch_tier[%u]", tier->idx);
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- tier->migrate = p;
+ kthread_stop(p);
+ put_task_struct(p);
}
-
- wake_up_process(tier->migrate);
- return 0;
}
-int bch2_tiering_start(struct bch_fs *c)
+int bch2_rebalance_start(struct bch_fs *c)
{
- struct bch_tier *tier;
- bool have_faster_tier = false;
+ struct task_struct *p;
if (c->opts.nochanges)
return 0;
- for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
- if (!dev_mask_nr(&tier->devs))
- continue;
-
- if (have_faster_tier) {
- int ret = __bch2_tiering_start(tier);
- if (ret)
- return ret;
- } else {
- __bch2_tiering_stop(tier);
- }
+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- have_faster_tier = true;
- }
+ get_task_struct(p);
+ rcu_assign_pointer(c->rebalance_thread, p);
+ wake_up_process(c->rebalance_thread);
return 0;
}
-void bch2_fs_tiering_init(struct bch_fs *c)
+void bch2_fs_rebalance_init(struct bch_fs *c)
{
- unsigned i;
+ bch2_pd_controller_init(&c->rebalance_pd);
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
- c->tiers[i].idx = i;
- bch2_pd_controller_init(&c->tiers[i].pd);
- }
+ atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
}
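
A quick arithmetic note on the rate floor in the new bch2_rebalance_thread() above
(assuming, as the "minimum 1 mb/sec" comment in the hunk suggests, that the pd
controller rate is in 512-byte sectors per second):

	1 << 11 = 2048 sectors; 2048 * 512 bytes = 1 MiB, so the floor corresponds to
	roughly 1 MB/sec of rebalance throughput.
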
diff --git a/fs/bcachefs/tier.h b/fs/bcachefs/tier.h
index f8eaa9b0e8c9..0c66dfea7c0d 100644
--- a/fs/bcachefs/tier.h
+++ b/fs/bcachefs/tier.h
@@ -1,8 +1,23 @@
#ifndef _BCACHEFS_TIER_H
#define _BCACHEFS_TIER_H
-void bch2_tiering_stop(struct bch_fs *);
-int bch2_tiering_start(struct bch_fs *);
-void bch2_fs_tiering_init(struct bch_fs *);
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = rcu_dereference(c->rebalance_thread);
+ if (p)
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
+ struct bch_io_opts *);
+void bch2_rebalance_add_work(struct bch_fs *, u64);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
#endif /* _BCACHEFS_TIER_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index c286f3f8859a..70742fc8a5fc 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -6,6 +6,7 @@
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
+#include "tier.h"
#include "xattr.h"
#include <linux/dcache.h>
@@ -457,6 +458,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
mutex_unlock(&inode->ei_update_lock);
+ if (value &&
+ (s.id == Opt_background_compression ||
+ s.id == Opt_background_target))
+ bch2_rebalance_add_work(c, inode->v.i_blocks);
+
return ret;
}
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index d132dd8a75e6..a7be2d8222d8 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -49,15 +49,13 @@ DECLARE_EVENT_CLASS(bch_dev,
TP_STRUCT__entry(
__array(char, uuid, 16 )
- __field(unsigned, tier )
),
TP_fast_assign(
memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->tier = ca->mi.tier;
),
- TP_printk("%pU tier %u", __entry->uuid, __entry->tier)
+ TP_printk("%pU", __entry->uuid)
);
DECLARE_EVENT_CLASS(bch_fs,