author    Kent Overstreet <kent.overstreet@gmail.com>  2017-03-10 07:08:39 -0900
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-03-17 19:49:23 -0800
commit    a5b2efedf8485ee4a36c736cf6cfe907c0db91c5 (patch)
tree      8dc1073413af469ad03da7132a5d7354aa40dc85
parent    de9690db2991d5d3a1f88211e9ef46c3b5a5dae4 (diff)
bcachefs: Rework struct bch_dev lifetime
Allocate all member devices when allocating struct bch_fs, not when they come online - this will let us handle running in degraded mode better, and ends up simplifying things a good bit.
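For orientation before the diff: the new model allocates a struct bch_dev for every valid slot in the superblock's member list when the filesystem object is created, and splits the old single refcount in two. A rough sketch of the resulting I/O-path convention (a simplified assumption drawn from the hunks below, not verbatim tree code):

	/*
	 * After this patch, c->devs[i] exists for every valid member from
	 * the moment the fs is allocated; whether the device is *online*
	 * is tracked separately by ca->io_ref (ca->ref now only counts
	 * references to the bch_dev object itself).
	 */
	static void sketch_io_to_ptr(struct bch_fs *c, unsigned dev_idx)
	{
		struct bch_dev *ca = c->devs[dev_idx];	/* no RCU lookup needed */

		if (!percpu_ref_tryget(&ca->io_ref))
			return;				/* device is offline */

		/* ... issue I/O against ca->disk_sb.bdev ... */

		percpu_ref_put(&ca->io_ref);
	}

This is why the diff below can replace rcu_read_lock()/PTR_DEV() pairs with plain c->devs[ptr->dev] dereferences, plus percpu_ref_tryget(&ca->io_ref) at the points where actual I/O is issued.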
-rw-r--r--  fs/bcachefs/alloc.c            93
-rw-r--r--  fs/bcachefs/alloc.h            45
-rw-r--r--  fs/bcachefs/bcache.h           26
-rw-r--r--  fs/bcachefs/bcachefs_format.h   2
-rw-r--r--  fs/bcachefs/blockdev.c         58
-rw-r--r--  fs/bcachefs/blockdev.h          5
-rw-r--r--  fs/bcachefs/btree_gc.c         28
-rw-r--r--  fs/bcachefs/btree_gc.h          2
-rw-r--r--  fs/bcachefs/btree_io.c         12
-rw-r--r--  fs/bcachefs/buckets.c          12
-rw-r--r--  fs/bcachefs/buckets.h          16
-rw-r--r--  fs/bcachefs/chardev.c           3
-rw-r--r--  fs/bcachefs/debug.c             2
-rw-r--r--  fs/bcachefs/error.c             6
-rw-r--r--  fs/bcachefs/error.h            19
-rw-r--r--  fs/bcachefs/extents.c         251
-rw-r--r--  fs/bcachefs/extents.h           7
-rw-r--r--  fs/bcachefs/fs.c               64
-rw-r--r--  fs/bcachefs/io.c               17
-rw-r--r--  fs/bcachefs/journal.c          67
-rw-r--r--  fs/bcachefs/journal.h           2
-rw-r--r--  fs/bcachefs/migrate.c           6
-rw-r--r--  fs/bcachefs/move.c              7
-rw-r--r--  fs/bcachefs/notify.c            3
-rw-r--r--  fs/bcachefs/opts.c              2
-rw-r--r--  fs/bcachefs/request.c          20
-rw-r--r--  fs/bcachefs/super-io.c        117
-rw-r--r--  fs/bcachefs/super-io.h         18
-rw-r--r--  fs/bcachefs/super.c           555
-rw-r--r--  fs/bcachefs/super.h            67
-rw-r--r--  fs/bcachefs/sysfs.c            58
-rw-r--r--  fs/bcachefs/sysfs.h            10
-rw-r--r--  fs/bcachefs/tier.c             16
33 files changed, 686 insertions, 930 deletions
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 5937c292b7cb..5bd6de9fb05c 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -74,6 +74,7 @@
#include <trace/events/bcachefs.h>
static void __bch_bucket_free(struct bch_dev *, struct bucket *);
+static void bch_recalc_min_prio(struct bch_dev *, int);
/* Allocation groups: */
@@ -84,7 +85,7 @@ void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
spin_lock(&grp->lock);
for (i = 0; i < grp->nr; i++)
- if (rcu_access_pointer(grp->d[i].dev) == ca) {
+ if (grp->d[i].dev == ca) {
grp->nr--;
memmove(&grp->d[i],
&grp->d[i + 1],
@@ -101,12 +102,12 @@ void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
spin_lock(&grp->lock);
for (i = 0; i < grp->nr; i++)
- if (rcu_access_pointer(grp->d[i].dev) == ca)
+ if (grp->d[i].dev == ca)
goto out;
BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
- rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
+ grp->d[grp->nr++].dev = ca;
out:
spin_unlock(&grp->lock);
}
@@ -137,7 +138,8 @@ static void pd_controllers_update(struct work_struct *work)
faster_tiers_dirty,
-1);
- group_for_each_dev_rcu(ca, &c->tiers[i].devs, iter) {
+ spin_lock(&c->tiers[i].devs.lock);
+ group_for_each_dev(ca, &c->tiers[i].devs, iter) {
struct bch_dev_usage stats = bch_dev_usage_read(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
@@ -172,6 +174,7 @@ static void pd_controllers_update(struct work_struct *work)
copygc_can_free += fragmented;
}
+ spin_unlock(&c->tiers[i].devs.lock);
}
rcu_read_unlock();
@@ -441,8 +444,15 @@ int bch_prio_read(struct bch_dev *ca)
bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
}
+
+ mutex_lock(&c->bucket_lock);
+ bch_recalc_min_prio(ca, READ);
+ bch_recalc_min_prio(ca, WRITE);
+ mutex_unlock(&c->bucket_lock);
+
+ ret = 0;
fsck_err:
- return 0;
+ return ret;
}
#define BUCKET_GC_GEN_MAX 96U
@@ -520,6 +530,8 @@ void bch_recalc_min_prio(struct bch_dev *ca, int rw)
u16 max_delta = 1;
unsigned i;
+ lockdep_assert_held(&c->bucket_lock);
+
/* Determine min prio for this particular cache */
for_each_bucket(g, ca)
max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
@@ -821,8 +833,8 @@ static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
spin_lock(&ca->freelist_lock);
bch_mark_alloc_bucket(ca, g, true);
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
+ g->read_prio = c->prio_clock[READ].hand;
+ g->write_prio = c->prio_clock[WRITE].hand;
verify_not_on_freelist(ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
@@ -1058,7 +1070,6 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c,
if (ob->nr_ptrs >= nr_replicas)
return ALLOC_SUCCESS;
- rcu_read_lock();
spin_lock(&devs->lock);
for (i = 0; i < devs->nr; i++)
@@ -1128,7 +1139,6 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c,
err:
EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
spin_unlock(&devs->lock);
- rcu_read_unlock();
return ret;
}
@@ -1223,14 +1233,14 @@ static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
lockdep_assert_held(&c->open_buckets_lock);
- rcu_read_lock();
- open_bucket_for_each_online_device(c, ob, ptr, ca)
+ open_bucket_for_each_ptr(ob, ptr) {
+ struct bch_dev *ca = c->devs[ptr->dev];
+
bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
- rcu_read_unlock();
+ }
ob->nr_ptrs = 0;
@@ -1283,12 +1293,13 @@ static struct open_bucket *bch_open_bucket_get(struct bch_fs *c,
return ret;
}
-static unsigned ob_ptr_sectors_free(struct open_bucket *ob,
- struct bch_member_rcu *mi,
+static unsigned ob_ptr_sectors_free(struct bch_fs *c,
+ struct open_bucket *ob,
struct bch_extent_ptr *ptr)
{
+ struct bch_dev *ca = c->devs[ptr->dev];
unsigned i = ptr - ob->ptrs;
- unsigned bucket_size = mi->m[ptr->dev].bucket_size;
+ unsigned bucket_size = ca->mi.bucket_size;
unsigned used = (ptr->offset & (bucket_size - 1)) +
ob->ptr_offset[i];
@@ -1301,14 +1312,11 @@ static unsigned open_bucket_sectors_free(struct bch_fs *c,
struct open_bucket *ob,
unsigned nr_replicas)
{
- struct bch_member_rcu *mi = fs_member_info_get(c);
unsigned i, sectors_free = UINT_MAX;
for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
sectors_free = min(sectors_free,
- ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]));
-
- fs_member_info_put();
+ ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
return sectors_free != UINT_MAX ? sectors_free : 0;
}
@@ -1317,11 +1325,10 @@ static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
struct open_bucket *new,
struct open_bucket *old)
{
- struct bch_member_rcu *mi = fs_member_info_get(c);
unsigned i;
for (i = 0; i < old->nr_ptrs; i++)
- if (ob_ptr_sectors_free(old, mi, &old->ptrs[i])) {
+ if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
struct bch_extent_ptr tmp = old->ptrs[i];
tmp.offset += old->ptr_offset[i];
@@ -1329,19 +1336,18 @@ static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
new->ptr_offset[new->nr_ptrs] = 0;
new->nr_ptrs++;
}
- fs_member_info_put();
}
static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
{
#ifdef CONFIG_BCACHEFS_DEBUG
const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
- rcu_read_lock();
- open_bucket_for_each_online_device(c, ob, ptr, ca)
+ open_bucket_for_each_ptr(ob, ptr) {
+ struct bch_dev *ca = c->devs[ptr->dev];
+
BUG_ON(ptr_stale(ca, ptr));
- rcu_read_unlock();
+ }
#endif
}
@@ -1485,7 +1491,6 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
unsigned sectors)
{
struct bch_extent_ptr tmp;
- struct bch_dev *ca;
bool has_data = false;
unsigned i;
@@ -1500,8 +1505,6 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
if (nr_replicas < ob->nr_ptrs)
has_data = true;
- rcu_read_lock();
-
for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
@@ -1512,11 +1515,8 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
ob->ptr_offset[i] += sectors;
- if ((ca = PTR_DEV(c, &ob->ptrs[i])))
- this_cpu_add(*ca->sectors_written, sectors);
+ this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
}
-
- rcu_read_unlock();
}
/*
@@ -1526,19 +1526,16 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
struct open_bucket *ob)
{
- struct bch_member_rcu *mi = fs_member_info_get(c);
bool has_data = false;
unsigned i;
for (i = 0; i < ob->nr_ptrs; i++) {
- if (!ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]))
+ if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
ob->has_full_ptrs = true;
else
has_data = true;
}
- fs_member_info_put();
-
if (likely(has_data))
atomic_inc(&ob->pin);
else
@@ -1600,8 +1597,7 @@ void bch_recalc_capacity(struct bch_fs *c)
unsigned long ra_pages = 0;
unsigned i, j;
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i) {
+ for_each_online_member(ca, c, i) {
struct backing_dev_info *bdi =
blk_get_backing_dev_info(ca->disk_sb.bdev);
@@ -1632,7 +1628,8 @@ void bch_recalc_capacity(struct bch_fs *c)
* Capacity of the filesystem is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
*/
- group_for_each_dev_rcu(ca, &slowest_tier->devs, i) {
+ spin_lock(&slowest_tier->devs.lock);
+ group_for_each_dev(ca, &slowest_tier->devs, i) {
size_t reserve = 0;
/*
@@ -1668,8 +1665,8 @@ void bch_recalc_capacity(struct bch_fs *c)
ca->mi.first_bucket) <<
ca->bucket_bits;
}
+ spin_unlock(&slowest_tier->devs.lock);
set_capacity:
- rcu_read_unlock();
total_capacity = capacity;
capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1828,6 +1825,8 @@ int bch_dev_allocator_start(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
+ struct bch_sb_field_journal *journal_buckets;
+ bool has_journal;
struct task_struct *k;
/*
@@ -1845,7 +1844,15 @@ int bch_dev_allocator_start(struct bch_dev *ca)
bch_dev_group_add(tier, ca);
bch_dev_group_add(&c->all_devs, ca);
- bch_dev_group_add(&c->journal.devs, ca);
+
+ mutex_lock(&c->sb_lock);
+ journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+ has_journal = bch_nr_journal_buckets(journal_buckets) >=
+ BCH_JOURNAL_BUCKETS_MIN;
+ mutex_unlock(&c->sb_lock);
+
+ if (has_journal)
+ bch_dev_group_add(&c->journal.devs, ca);
bch_recalc_capacity(c);
diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h
index bd50fec8f3c7..f8aa762de2e0 100644
--- a/fs/bcachefs/alloc.h
+++ b/fs/bcachefs/alloc.h
@@ -25,8 +25,6 @@ void bch_dev_group_add(struct dev_group *, struct bch_dev *);
int bch_prio_read(struct bch_dev *);
-void bch_recalc_min_prio(struct bch_dev *, int);
-
size_t bch_bucket_alloc(struct bch_dev *, enum alloc_reserve);
void bch_open_bucket_put(struct bch_fs *, struct open_bucket *);
@@ -56,54 +54,27 @@ static inline void bch_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
-static inline struct bch_dev *dev_group_next_rcu(struct dev_group *devs,
- unsigned *iter)
+static inline struct bch_dev *dev_group_next(struct dev_group *devs,
+ unsigned *iter)
{
struct bch_dev *ret = NULL;
while (*iter < devs->nr &&
- !(ret = rcu_dereference(devs->d[*iter].dev)))
+ !(ret = rcu_dereference_check(devs->d[*iter].dev,
+ lockdep_is_held(&devs->lock))))
(*iter)++;
return ret;
}
-#define group_for_each_dev_rcu(ca, devs, iter) \
+#define group_for_each_dev(ca, devs, iter) \
for ((iter) = 0; \
- ((ca) = dev_group_next_rcu((devs), &(iter))); \
+ ((ca) = dev_group_next((devs), &(iter))); \
(iter)++)
-static inline struct bch_dev *dev_group_next(struct dev_group *devs,
- unsigned *iter)
-{
- struct bch_dev *ret;
-
- rcu_read_lock();
- if ((ret = dev_group_next_rcu(devs, iter)))
- percpu_ref_get(&ret->ref);
- rcu_read_unlock();
-
- return ret;
-}
-
-#define group_for_each_dev(ca, devs, iter) \
- for ((iter) = 0; \
- (ca = dev_group_next(devs, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
-
-#define __open_bucket_next_online_device(_c, _ob, _ptr, _ca) \
-({ \
- (_ca) = NULL; \
- \
- while ((_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs && \
- !((_ca) = PTR_DEV(_c, _ptr))) \
- (_ptr)++; \
- (_ca); \
-})
-
-#define open_bucket_for_each_online_device(_c, _ob, _ptr, _ca) \
+#define open_bucket_for_each_ptr(_ob, _ptr) \
for ((_ptr) = (_ob)->ptrs; \
- ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
+ (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
(_ptr)++)
void bch_recalc_capacity(struct bch_fs *);
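To restate the locking change in these two files: group_for_each_dev() is no longer an RCU iterator. dev_group_next() now uses rcu_dereference_check() with lockdep_is_held(&devs->lock), so callers pin the group with its spinlock instead of rcu_read_lock(). The callers updated in alloc.c above follow this shape (a condensed sketch of the pattern, not verbatim tree code):

	spin_lock(&c->tiers[i].devs.lock);
	group_for_each_dev(ca, &c->tiers[i].devs, iter) {
		/* devs.lock keeps the group's membership array stable here */
	}
	spin_unlock(&c->tiers[i].devs.lock);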
diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h
index af2cfc5fe77d..dd9e3b8253ff 100644
--- a/fs/bcachefs/bcache.h
+++ b/fs/bcachefs/bcache.h
@@ -317,7 +317,8 @@ struct crypto_blkcipher;
struct crypto_ahash;
enum gc_phase {
- GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1,
+ GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
+ GC_PHASE_PENDING_DELETE,
GC_PHASE_DONE
};
@@ -340,21 +341,15 @@ struct bch_member_cpu {
u8 valid;
};
-struct bch_member_rcu {
- struct rcu_head rcu;
- unsigned nr_devices;
- struct bch_member_cpu m[];
-};
-
struct bch_dev {
+ struct kobject kobj;
struct percpu_ref ref;
- struct rcu_head free_rcu;
- struct work_struct free_work;
+ struct percpu_ref io_ref;
+ struct completion stop_complete;
+ struct completion offline_complete;
struct bch_fs *fs;
- struct dev_group self;
-
u8 dev_idx;
/*
* Cached version of this device's member info from superblock
@@ -362,10 +357,11 @@ struct bch_dev {
*/
struct bch_member_cpu mi;
uuid_le uuid;
+ char name[BDEVNAME_SIZE];
struct bcache_superblock disk_sb;
- struct kobject kobj;
+ struct dev_group self;
/* biosets used in cloned bios for replicas and moving_gc */
struct bio_set replica_set;
@@ -517,12 +513,6 @@ struct bch_fs {
struct bch_opts opts;
- /*
- * Cached copy in native endianness:
- * Set by bch_fs_mi_update():
- */
- struct bch_member_rcu __rcu *members;
-
/* Updated by bch_sb_update():*/
struct {
uuid_le uuid;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index ac3b8b458f44..f4c2f275bf78 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -788,7 +788,7 @@ LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif
enum bch_member_state {
- BCH_MEMBER_STATE_ACTIVE = 0,
+ BCH_MEMBER_STATE_RW = 0,
BCH_MEMBER_STATE_RO = 1,
BCH_MEMBER_STATE_FAILED = 2,
BCH_MEMBER_STATE_SPARE = 3,
diff --git a/fs/bcachefs/blockdev.c b/fs/bcachefs/blockdev.c
index 5da771e1158c..a4522ad2836f 100644
--- a/fs/bcachefs/blockdev.c
+++ b/fs/bcachefs/blockdev.c
@@ -17,6 +17,8 @@
static int bch_blockdev_major;
static DEFINE_IDA(bch_blockdev_minor);
static LIST_HEAD(uncached_devices);
+static DEFINE_MUTEX(bch_blockdev_lock);
+
static struct kmem_cache *bch_search_cache;
static void write_bdev_super_endio(struct bio *bio)
@@ -62,21 +64,6 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
-bool bch_is_open_backing_dev(struct block_device *bdev)
-{
- struct bch_fs *c, *tc;
- struct cached_dev *dc, *t;
-
- list_for_each_entry_safe(c, tc, &bch_fs_list, list)
- list_for_each_entry_safe(dc, t, &c->cached_devs, list)
- if (dc->disk_sb.bdev == bdev)
- return true;
- list_for_each_entry_safe(dc, t, &uncached_devices, list)
- if (dc->disk_sb.bdev == bdev)
- return true;
- return false;
-}
-
static int open_dev(struct block_device *b, fmode_t mode)
{
struct bcache_device *d = b->bd_disk->private_data;
@@ -118,8 +105,6 @@ void bch_blockdev_stop(struct bcache_device *d)
static void bcache_device_unlink(struct bcache_device *d)
{
- lockdep_assert_held(&bch_register_lock);
-
if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
sysfs_remove_link(&d->c->kobj, d->name);
sysfs_remove_link(&d->kobj, "cache");
@@ -141,8 +126,6 @@ static void bcache_device_link(struct bcache_device *d, struct bch_fs *c,
static void bcache_device_detach(struct bcache_device *d)
{
- lockdep_assert_held(&bch_register_lock);
-
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
mutex_lock(&d->inode_lock);
bch_inode_rm(d->c, bcache_dev_inum(d));
@@ -161,8 +144,6 @@ static int bcache_device_attach(struct bcache_device *d, struct bch_fs *c)
{
int ret;
- lockdep_assert_held(&bch_register_lock);
-
ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
if (ret) {
pr_err("radix_tree_insert() error for inum %llu",
@@ -178,8 +159,6 @@ static int bcache_device_attach(struct bcache_device *d, struct bch_fs *c)
static void bcache_device_free(struct bcache_device *d)
{
- lockdep_assert_held(&bch_register_lock);
-
pr_info("%s stopped", d->disk->disk_name);
if (d->c)
@@ -325,7 +304,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
BUG_ON(atomic_read(&dc->count));
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_blockdev_lock);
memset(&dc->disk_sb.sb->set_uuid, 0, 16);
SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);
@@ -339,7 +318,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_blockdev_lock);
pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));
@@ -349,8 +328,6 @@ static void cached_dev_detach_finish(struct work_struct *w)
void bch_cached_dev_detach(struct cached_dev *dc)
{
- lockdep_assert_held(&bch_register_lock);
-
if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return;
@@ -495,11 +472,14 @@ void bch_attach_backing_devs(struct bch_fs *c)
{
struct cached_dev *dc, *t;
- lockdep_assert_held(&bch_register_lock);
lockdep_assert_held(&c->state_lock);
+ mutex_lock(&bch_blockdev_lock);
+
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bch_cached_dev_attach(dc, c);
+
+ mutex_unlock(&bch_blockdev_lock);
}
void bch_cached_dev_release(struct kobject *kobj)
@@ -517,14 +497,14 @@ static void cached_dev_free(struct closure *cl)
bch_cached_dev_writeback_stop(dc);
bch_cached_dev_writeback_free(dc);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_blockdev_lock);
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
list_del(&dc->list);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_blockdev_lock);
bch_free_super((void *) &dc->disk_sb);
@@ -536,11 +516,8 @@ static void cached_dev_flush(struct closure *cl)
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
- mutex_lock(&bch_register_lock);
- bcache_device_unlink(d);
- mutex_unlock(&bch_register_lock);
-
bch_cache_accounting_destroy(&dc->accounting);
+ bcache_device_unlink(d);
kobject_del(&d->kobj);
continue_at(cl, cached_dev_free, system_wq);
@@ -652,8 +629,11 @@ const char *bch_backing_dev_register(struct bcache_superblock *sb)
bdevname(dc->disk_sb.bdev, name));
list_add(&dc->list, &uncached_devices);
- list_for_each_entry(c, &bch_fs_list, list)
+ c = bch_uuid_to_fs(dc->disk_sb.sb->set_uuid);
+ if (c) {
bch_cached_dev_attach(dc, c);
+ closure_put(&c->cl);
+ }
if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
@@ -678,9 +658,7 @@ static void blockdev_volume_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
- mutex_lock(&bch_register_lock);
bcache_device_free(d);
- mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
}
@@ -688,9 +666,7 @@ static void blockdev_volume_flush(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
- mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
- mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
continue_at(cl, blockdev_volume_free, system_wq);
}
@@ -792,7 +768,7 @@ void bch_blockdevs_stop(struct bch_fs *c)
struct radix_tree_iter iter;
void **slot;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_blockdev_lock);
rcu_read_lock();
radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
@@ -808,7 +784,7 @@ void bch_blockdevs_stop(struct bch_fs *c)
}
rcu_read_unlock();
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_blockdev_lock);
}
void bch_fs_blockdev_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/blockdev.h b/fs/bcachefs/blockdev.h
index 0062ef7d1df3..5423d77644f8 100644
--- a/fs/bcachefs/blockdev.h
+++ b/fs/bcachefs/blockdev.h
@@ -59,7 +59,6 @@ void bch_cached_dev_detach(struct cached_dev *);
void bch_cached_dev_run(struct cached_dev *);
void bch_blockdev_stop(struct bcache_device *);
-bool bch_is_open_backing_dev(struct block_device *);
const char *bch_backing_dev_register(struct bcache_superblock *);
int bch_blockdev_volume_create(struct bch_fs *, u64);
@@ -90,10 +89,6 @@ static inline void bch_cached_dev_detach(struct cached_dev *dc) {}
static inline void bch_cached_dev_run(struct cached_dev *dc) {}
static inline void bch_blockdev_stop(struct bcache_device *d) {}
-static inline bool bch_is_open_backing_dev(struct block_device *bdev)
-{
- return false;
-}
static inline const char *bch_backing_dev_register(struct bcache_superblock *sb)
{
return "not implemented";
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 9c34269736c8..7e8a3f6a17df 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -90,15 +90,13 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
u8 max_stale = 0;
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- rcu_read_lock();
-
- extent_for_each_online_device(c, e, ptr, ca) {
+ extent_for_each_ptr(e, ptr) {
+ struct bch_dev *ca = c->devs[ptr->dev];
size_t b = PTR_BUCKET_NR(ca, ptr);
if (__gen_after(ca->oldest_gens[b], ptr->gen))
@@ -106,8 +104,6 @@ u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
max_stale = max(max_stale, ptr_stale(ca, ptr));
}
-
- rcu_read_unlock();
}
return max_stale;
@@ -254,10 +250,10 @@ static void bch_mark_allocator_buckets(struct bch_fs *c)
const struct bch_extent_ptr *ptr;
mutex_lock(&ob->lock);
- rcu_read_lock();
- open_bucket_for_each_online_device(c, ob, ptr, ca)
+ open_bucket_for_each_ptr(ob, ptr) {
+ ca = c->devs[ptr->dev];
bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
- rcu_read_unlock();
+ }
mutex_unlock(&ob->lock);
}
}
@@ -273,7 +269,7 @@ static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
} while (b < end >> ca->bucket_bits);
}
-void bch_dev_mark_superblocks(struct bch_dev *ca)
+static void bch_dev_mark_superblocks(struct bch_dev *ca)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
@@ -294,11 +290,13 @@ void bch_dev_mark_superblocks(struct bch_dev *ca)
/*
* Mark non btree metadata - prios, journal
*/
-static void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
+void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
u64 b;
+ lockdep_assert_held(&c->sb_lock);
+
bch_dev_mark_superblocks(ca);
spin_lock(&c->journal.lock);
@@ -329,10 +327,10 @@ static void bch_mark_metadata(struct bch_fs *c)
unsigned i;
mutex_lock(&c->sb_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
- for_each_member_device(ca, c, i)
+ for_each_online_member(ca, c, i)
bch_mark_dev_metadata(c, ca);
-
mutex_unlock(&c->sb_lock);
}
@@ -935,14 +933,14 @@ int bch_initial_gc(struct bch_fs *c, struct list_head *journal)
{
enum btree_id id;
- bch_mark_metadata(c);
-
for (id = 0; id < BTREE_ID_NR; id++)
bch_initial_gc_btree(c, id);
if (journal)
bch_journal_mark(c, journal);
+ bch_mark_metadata(c);
+
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 590ade2d8211..f1794fdf4378 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -13,7 +13,7 @@ int bch_initial_gc(struct bch_fs *, struct list_head *);
u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
u8 bch_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
-void bch_dev_mark_superblocks(struct bch_dev *);
+void bch_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index d11d72fc9f39..71478fb1cc89 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1226,7 +1226,7 @@ void bch_btree_node_read(struct bch_fs *c, struct btree *b)
bch_time_stats_update(&c->btree_read_time, start_time);
out:
bio_put(bio);
- percpu_ref_put(&pick.ca->ref);
+ percpu_ref_put(&pick.ca->io_ref);
}
int bch_btree_root_read(struct bch_fs *c, enum btree_id id,
@@ -1319,7 +1319,7 @@ static void btree_node_write_endio(struct bio *bio)
}
if (ca)
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
}
void __bch_btree_node_write(struct bch_fs *c, struct btree *b,
@@ -1336,7 +1336,6 @@ void __bch_btree_node_write(struct bch_fs *c, struct btree *b,
BKEY_PADDED(key) k;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
struct sort_iter sort_iter;
struct nonce nonce;
unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
@@ -1557,10 +1556,9 @@ void __bch_btree_node_write(struct bch_fs *c, struct btree *b,
extent_for_each_ptr(e, ptr)
ptr->offset += b->written;
- rcu_read_lock();
- extent_for_each_online_device(c, e, ptr, ca)
- atomic64_add(sectors_to_write, &ca->btree_sectors_written);
- rcu_read_unlock();
+ extent_for_each_ptr(e, ptr)
+ atomic64_add(sectors_to_write,
+ &c->devs[ptr->dev]->btree_sectors_written);
b->written += sectors_to_write;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 59c68d493995..8514f5472016 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -450,8 +450,8 @@ static void bch_mark_pointer(struct bch_fs *c,
{
struct bucket_mark old, new;
unsigned saturated;
- struct bch_dev *ca;
- struct bucket *g;
+ struct bch_dev *ca = c->devs[ptr->dev];
+ struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
u64 v;
unsigned old_sectors, new_sectors;
int disk_sectors, compressed_sectors;
@@ -469,12 +469,6 @@ static void bch_mark_pointer(struct bch_fs *c,
compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ __compressed_sectors(crc, new_sectors);
- ca = PTR_DEV(c, ptr);
- if (!ca)
- goto out;
-
- g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
-
if (gc_will_visit) {
if (journal_seq)
bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
@@ -565,13 +559,11 @@ static void bch_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
- rcu_read_lock();
extent_for_each_ptr_crc(e, ptr, crc)
bch_mark_pointer(c, e, crc, ptr, sectors,
ptr->cached ? S_CACHED : type,
may_make_unavailable,
stats, gc_will_visit, journal_seq);
- rcu_read_unlock();
}
static void __bch_mark_key(struct bch_fs *c, struct bkey_s_c k,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index d189c72fb8ad..9a00d38a682a 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -39,14 +39,6 @@ static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g)
return g->mark.gen - ca->oldest_gens[r];
}
-static inline struct bch_dev *PTR_DEV(const struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
-{
- EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_devices);
-
- return rcu_dereference(c->devs[ptr->dev]);
-}
-
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
@@ -64,14 +56,12 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c,
#if 0
if (bkey_extent_is_data(&k->k)) {
const struct bch_extent_ptr *ptr;
- const struct bch_dev *ca;
- rcu_read_lock();
- extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) {
+ extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) {
+ const struct bch_dev *ca = c->devs[ptr->dev];
bucket = PTR_BUCKET_NR(ca, ptr);
break;
}
- rcu_read_unlock();
}
#endif
return bucket;
@@ -102,8 +92,6 @@ static inline u8 gen_after(u8 a, u8 b)
/**
* ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
- *
- * Warning: PTR_DEV(c, k, ptr) must equal ca.
*/
static inline u8 ptr_stale(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4b1fd946da32..9ef8cfc64d99 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -124,7 +124,7 @@ static long bch_ioctl_disk_add(struct bch_fs *c,
/* returns with ref on ca->ref */
static struct bch_dev *bch_device_lookup(struct bch_fs *c,
- const char __user *dev)
+ const char __user *dev)
{
struct block_device *bdev;
struct bch_dev *ca;
@@ -166,7 +166,6 @@ static long bch_ioctl_disk_remove(struct bch_fs *c,
ret = bch_dev_remove(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
return ret;
}
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bfe73c22109..b91f53d261d2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -88,7 +88,7 @@ void __bch_btree_verify(struct bch_fs *c, struct btree *b)
bch_btree_node_read_done(c, v, pick.ca, &pick.ptr);
n_sorted = c->verify_data->data;
- percpu_ref_put(&pick.ca->ref);
+ percpu_ref_put(&pick.ca->io_ref);
sorted = &n_sorted->keys;
inmemory = &n_inmemory->keys;
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 48087fba967c..ba46d2d12f59 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -112,7 +112,6 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
unsigned errors = atomic_read(&ca->io_errors);
- char buf[BDEVNAME_SIZE];
bool dev;
if (errors < c->error_limit) {
@@ -127,9 +126,8 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
? __bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED)
: bch_fs_emergency_read_only(c))
- bch_err(c,
- "too many IO errors on %s, setting %s RO",
- bdevname(ca->disk_sb.bdev, buf),
+ bch_err(ca,
+ "too many IO errors, setting %s RO",
dev ? "device" : "filesystem");
mutex_unlock(&c->state_lock);
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fe8e186ada1a..726b20d4434b 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -13,13 +13,6 @@ struct bch_fs;
/* Error messages: */
-#define __bch_dev_error(ca, fmt, ...) \
-do { \
- char _buf[BDEVNAME_SIZE]; \
- bch_err((ca)->fs, "%s: " fmt, \
- bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
-} while (0)
-
/*
* Very fatal logic/inconsistency errors: these indicate that we've majorly
* screwed up at runtime, i.e. it's not likely that it was just caused by the
@@ -75,7 +68,7 @@ do { \
#define bch_dev_inconsistent(ca, ...) \
do { \
- __bch_dev_error(ca, __VA_ARGS__); \
+ bch_err(ca, __VA_ARGS__); \
bch_inconsistent_error((ca)->fs); \
} while (0)
@@ -171,17 +164,15 @@ do { \
#define bch_dev_fatal_error(ca, ...) \
do { \
- __bch_dev_error(ca, __VA_ARGS__); \
+ bch_err(ca, __VA_ARGS__); \
bch_fatal_error(c); \
} while (0)
#define bch_dev_fatal_io_error(ca, fmt, ...) \
do { \
- char _buf[BDEVNAME_SIZE]; \
- \
printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \
"fatal IO error on %s for " fmt), \
- bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ (ca)->name, ##__VA_ARGS__); \
bch_fatal_error((ca)->fs); \
} while (0)
@@ -219,11 +210,9 @@ do { \
/* Logs message and handles the error: */
#define bch_dev_nonfatal_io_error(ca, fmt, ...) \
do { \
- char _buf[BDEVNAME_SIZE]; \
- \
printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \
"IO error on %s for " fmt), \
- bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ (ca)->name, ##__VA_ARGS__); \
bch_nonfatal_io_error(ca); \
} while (0)
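The error-message hunks above depend on the new name[BDEVNAME_SIZE] field added to struct bch_dev in bcache.h earlier in this patch: the block device's name is cached in the bch_dev once, so error paths can print it without calling bdevname() on a bdev that may already be gone. A hypothetical sketch of where that cache would be filled (the actual assignment presumably lives in super.c, whose diff is not included in this excerpt):

	/* hypothetical helper - assumed, not shown in this excerpt */
	static void bch_dev_cache_name(struct bch_dev *ca)
	{
		bdevname(ca->disk_sb.bdev, ca->name);	/* while bdev is valid */
	}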
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 731dce2ec7d5..87a68d738567 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -322,9 +322,9 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
- struct bch_dev *ca;
+ struct bch_dev *ca = c->devs[ptr->dev];
- return (ca = PTR_DEV(c, ptr)) && ptr_stale(ca, ptr);
+ return ptr_stale(ca, ptr);
}
static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -332,14 +332,12 @@ static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
struct bch_extent_ptr *ptr = &e.v->start->ptr;
bool dropped = false;
- rcu_read_lock();
while ((ptr = extent_ptr_next(e, ptr)))
if (should_drop_ptr(c, e.c, ptr)) {
__bch_extent_drop_ptr(e, ptr);
dropped = true;
} else
ptr++;
- rcu_read_unlock();
if (dropped)
bch_extent_drop_redundant_crcs(e);
@@ -387,30 +385,39 @@ static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
}
}
-static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
- const struct bch_member_rcu *mi,
+static const char *extent_ptr_invalid(const struct bch_fs *c,
+ struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr,
- unsigned size_ondisk)
+ unsigned size_ondisk,
+ bool metadata)
{
const struct bch_extent_ptr *ptr2;
- const struct bch_member_cpu *m = mi->m + ptr->dev;
+ struct bch_dev *ca;
+
+ if (ptr->dev >= c->sb.nr_devices)
+ return "pointer to invalid device";
- if (ptr->dev > mi->nr_devices || !m->valid)
+ ca = c->devs[ptr->dev];
+ if (!ca)
return "pointer to invalid device";
extent_for_each_ptr(e, ptr2)
if (ptr != ptr2 && ptr->dev == ptr2->dev)
return "multiple pointers to same device";
- if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets)
+ if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
return "offset past end of device";
- if (ptr->offset < m->bucket_size * m->first_bucket)
+ if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
return "offset before first bucket";
- if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size)
+ if ((ptr->offset & (ca->mi.bucket_size - 1)) +
+ size_ondisk > ca->mi.bucket_size)
return "spans multiple buckets";
+ if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
+ return "device not marked as containing data";
+
return NULL;
}
@@ -426,7 +433,6 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
- rcu_read_lock();
extent_for_each_entry(e, entry) {
if (!first)
p(" ");
@@ -445,10 +451,11 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
+ ca = c->devs[ptr->dev];
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
- (ca = PTR_DEV(c, ptr)) && ptr_stale(ca, ptr)
+ ca && ptr_stale(ca, ptr)
? " stale" : "");
break;
default:
@@ -459,8 +466,6 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
first = false;
}
out:
- rcu_read_unlock();
-
if (bkey_extent_is_cached(e.k))
p(" cached");
#undef p
@@ -487,27 +492,20 @@ static const char *bch_btree_ptr_invalid(const struct bch_fs *c,
const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
- struct bch_member_rcu *mi;
const char *reason;
extent_for_each_entry(e, entry)
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
- mi = fs_member_info_get(c);
-
extent_for_each_ptr_crc(e, ptr, crc) {
- reason = extent_ptr_invalid(e, mi, ptr,
- c->sb.btree_node_size);
-
- if (reason) {
- fs_member_info_put();
+ reason = extent_ptr_invalid(c, e, ptr,
+ c->sb.btree_node_size,
+ true);
+ if (reason)
return reason;
- }
}
- fs_member_info_put();
-
if (crc)
return "has crc field";
@@ -532,32 +530,26 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
unsigned replicas = 0;
bool bad;
- rcu_read_lock();
-
- extent_for_each_online_device(c, e, ptr, ca) {
+ extent_for_each_ptr(e, ptr) {
+ ca = c->devs[ptr->dev];
+ g = PTR_BUCKET(ca, ptr);
replicas++;
- if ((ca = PTR_DEV(c, ptr))) {
- g = PTR_BUCKET(ca, ptr);
+ err = "stale";
+ if (ptr_stale(ca, ptr))
+ goto err;
- err = "stale";
- if (ptr_stale(ca, ptr))
- goto err;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- g->mark.data_type != BUCKET_BTREE;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ g->mark.data_type != BUCKET_BTREE;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
- err = "inconsistent";
- if (bad)
- goto err;
- }
+ err = "inconsistent";
+ if (bad)
+ goto err;
}
- rcu_read_unlock();
-
if (replicas < c->sb.meta_replicas_have) {
bch_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
@@ -576,7 +568,6 @@ err:
g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
(unsigned) g->mark.counter);
- rcu_read_unlock();
}
static void bch_btree_ptr_to_text(struct bch_fs *c, char *buf,
@@ -603,11 +594,9 @@ bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct extent_pick_ptr pick = { .ca = NULL };
- struct bch_dev *ca;
-
- rcu_read_lock();
- extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ struct bch_dev *ca = c->devs[ptr->dev];
struct btree *root = btree_node_root(c, b);
if (bch_fs_inconsistent_on(crc, c,
@@ -628,15 +617,16 @@ bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
continue;
+ if (!percpu_ref_tryget(&ca->io_ref))
+ continue;
+
+ if (pick.ca)
+ percpu_ref_put(&pick.ca->io_ref);
+
pick.ca = ca;
pick.ptr = *ptr;
}
- if (pick.ca)
- percpu_ref_get(&pick.ca->ref);
-
- rcu_read_unlock();
-
return pick;
}
@@ -1757,47 +1747,38 @@ static const char *bch_extent_invalid(const struct bch_fs *c,
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
- struct bch_member_rcu *mi = fs_member_info_get(c);
unsigned size_ondisk = e.k->size;
const char *reason;
extent_for_each_entry(e, entry) {
- reason = "invalid extent entry type";
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- goto invalid;
+ return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
crc = entry_to_crc(entry);
- reason = "checksum offset + key size > uncompressed size";
if (crc_offset(crc) + e.k->size >
crc_uncompressed_size(e.k, crc))
- goto invalid;
+ return "checksum offset + key size > uncompressed size";
size_ondisk = crc_compressed_size(e.k, crc);
- reason = "invalid checksum type";
if (!bch_checksum_type_valid(c, crc_csum_type(crc)))
- goto invalid;
+ return "invalid checksum type";
- reason = "invalid compression type";
if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
- goto invalid;
+ return "invalid compression type";
} else {
ptr = entry_to_ptr(entry);
- reason = extent_ptr_invalid(e, mi,
- &entry->ptr, size_ondisk);
+ reason = extent_ptr_invalid(c, e, &entry->ptr,
+ size_ondisk, false);
if (reason)
- goto invalid;
+ return reason;
}
}
- fs_member_info_put();
return NULL;
-invalid:
- fs_member_info_put();
- return reason;
}
case BCH_RESERVATION: {
@@ -1821,14 +1802,13 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
- struct bch_member_rcu *mi;
struct bch_dev *ca;
struct bucket *g;
unsigned seq, stale;
char buf[160];
bool bad;
unsigned ptrs_per_tier[BCH_TIER_MAX];
- unsigned tier, replicas = 0;
+ unsigned replicas = 0;
/*
* XXX: we should be doing most/all of these checks at startup time,
@@ -1841,13 +1821,11 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
- mi = fs_member_info_get(c);
-
extent_for_each_ptr(e, ptr) {
+ ca = c->devs[ptr->dev];
+ g = PTR_BUCKET(ca, ptr);
replicas++;
-
- if (ptr->dev >= mi->nr_devices)
- goto bad_device;
+ ptrs_per_tier[ca->mi.tier]++;
/*
* If journal replay hasn't finished, we might be seeing keys
@@ -1856,51 +1834,40 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
continue;
- if (!mi->m[ptr->dev].valid)
- goto bad_device;
-
- tier = mi->m[ptr->dev].tier;
- ptrs_per_tier[tier]++;
-
stale = 0;
- if ((ca = PTR_DEV(c, ptr))) {
- g = PTR_BUCKET(ca, ptr);
-
- do {
- struct bucket_mark mark;
+ do {
+ struct bucket_mark mark;
- seq = read_seqcount_begin(&c->gc_pos_lock);
- mark = READ_ONCE(g->mark);
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ mark = READ_ONCE(g->mark);
- /* between mark and bucket gen */
- smp_rmb();
+ /* between mark and bucket gen */
+ smp_rmb();
- stale = ptr_stale(ca, ptr);
+ stale = ptr_stale(ca, ptr);
- bch_fs_bug_on(stale && !ptr->cached, c,
- "stale dirty pointer");
+ bch_fs_bug_on(stale && !ptr->cached, c,
+ "stale dirty pointer");
- bch_fs_bug_on(stale > 96, c,
- "key too stale: %i",
- stale);
+ bch_fs_bug_on(stale > 96, c,
+ "key too stale: %i",
+ stale);
- if (stale)
- break;
+ if (stale)
+ break;
- bad = (mark.data_type != BUCKET_DATA ||
- (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- !mark.owned_by_allocator &&
- !(ptr->cached
- ? mark.cached_sectors
- : mark.dirty_sectors)));
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+ bad = (mark.data_type != BUCKET_DATA ||
+ (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ !mark.owned_by_allocator &&
+ !(ptr->cached
+ ? mark.cached_sectors
+ : mark.dirty_sectors)));
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
- if (bad)
- goto bad_ptr;
- }
+ if (bad)
+ goto bad_ptr;
}
- fs_member_info_put();
if (replicas > BCH_REPLICAS_MAX) {
bch_bkey_val_to_text(c, btree_node_type(b), buf,
@@ -1923,14 +1890,6 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
return;
-bad_device:
- bch_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch_fs_bug(c, "extent pointer to dev %u missing device: %s",
- ptr->dev, buf);
- fs_member_info_put();
- return;
-
bad_ptr:
bch_bkey_val_to_text(c, btree_node_type(b), buf,
sizeof(buf), e.s_c);
@@ -1940,7 +1899,6 @@ bad_ptr:
g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
(unsigned) g->mark.counter);
- fs_member_info_put();
return;
}
@@ -1976,12 +1934,10 @@ static void bch_extent_to_text(struct bch_fs *c, char *buf,
#undef p
}
-static unsigned PTR_TIER(struct bch_member_rcu *mi,
+static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return ptr->dev < mi->nr_devices
- ? mi->m[ptr->dev].tier
- : UINT_MAX;
+ return c->devs[ptr->dev]->mi.tier;
}
static void bch_extent_crc_init(union bch_extent_crc *crc,
@@ -2136,35 +2092,30 @@ void bch_extent_mark_replicas_cached(struct bch_fs *c,
unsigned nr_cached)
{
struct bch_extent_ptr *ptr;
- struct bch_member_rcu *mi;
bool have_higher_tier;
unsigned tier = 0;
if (!nr_cached)
return;
- mi = fs_member_info_get(c);
-
do {
have_higher_tier = false;
extent_for_each_ptr(e, ptr) {
if (!ptr->cached &&
- PTR_TIER(mi, ptr) == tier) {
+ PTR_TIER(c, ptr) == tier) {
ptr->cached = true;
nr_cached--;
if (!nr_cached)
- goto out;
+ return;
}
- if (PTR_TIER(mi, ptr) > tier)
+ if (PTR_TIER(c, ptr) > tier)
have_higher_tier = true;
}
tier++;
} while (have_higher_tier);
-out:
- fs_member_info_put();
}
/*
@@ -2182,7 +2133,6 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_extent e;
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
switch (k.k->type) {
case KEY_TYPE_DELETED:
@@ -2198,10 +2148,11 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
- rcu_read_lock();
ret->ca = NULL;
- extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ struct bch_dev *ca = c->devs[ptr->dev];
+
if (ptr_stale(ca, ptr))
continue;
@@ -2213,6 +2164,12 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
ret->ca->mi.tier < ca->mi.tier))
continue;
+ if (!percpu_ref_tryget(&ca->io_ref))
+ continue;
+
+ if (ret->ca)
+ percpu_ref_put(&ret->ca->io_ref);
+
*ret = (struct extent_pick_ptr) {
.crc = crc_to_128(e.k, crc),
.ptr = *ptr,
@@ -2220,12 +2177,8 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
};
}
- if (ret->ca)
- percpu_ref_get(&ret->ca->ref);
- else if (!bkey_extent_is_cached(e.k))
+ if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO);
-
- rcu_read_unlock();
return;
case BCH_RESERVATION:
@@ -2273,7 +2226,7 @@ static enum merge_result bch_extent_merge(struct bch_fs *c,
extent_for_each_entry(el, en_l) {
struct bch_extent_ptr *lp, *rp;
- struct bch_member_cpu *m;
+ unsigned bucket_size;
en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
@@ -2291,15 +2244,11 @@ static enum merge_result bch_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
+ bucket_size = c->devs[lp->dev]->mi.bucket_size;
- m = fs_member_info_get(c)->m + lp->dev;
- if ((lp->offset & ~((u64) m->bucket_size - 1)) !=
- (rp->offset & ~((u64) m->bucket_size - 1))) {
- fs_member_info_put();
+ if ((lp->offset & ~((u64) bucket_size - 1)) !=
+ (rp->offset & ~((u64) bucket_size - 1)))
return BCH_MERGE_NOMERGE;
-
- }
- fs_member_info_put();
}
break;
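The two pick-pointer hunks above share one pattern worth making explicit: a device can only be selected as a read target if its io_ref can be taken, and a previously selected candidate's ref is dropped when a better one wins. Condensed from the hunks (minor details elided):

	extent_for_each_ptr_crc(e, ptr, crc) {
		struct bch_dev *ca = c->devs[ptr->dev];

		if (ptr_stale(ca, ptr))
			continue;

		if (!percpu_ref_tryget(&ca->io_ref))
			continue;		/* offline: not a valid read target */

		if (pick.ca)
			percpu_ref_put(&pick.ca->io_ref);

		pick.ca  = ca;
		pick.ptr = *ptr;
	}

The caller then owns one io_ref on pick.ca (if any), which the completion paths in io.c, debug.c, and btree_io.c drop with percpu_ref_put(&ca->io_ref), as seen in their hunks above.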
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 2d70c42a695b..db7bd4f14988 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -285,10 +285,6 @@ out: \
#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
-#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \
- extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \
- ((_ca) = PTR_DEV(_c, _ptr)))
-
/* Iterate over pointers only, and from a given position: */
#define extent_ptr_next_filter(_e, _ptr, _filter) \
@@ -309,9 +305,6 @@ out: \
#define extent_for_each_ptr(_e, _ptr) \
extent_for_each_ptr_filter(_e, _ptr, true)
-#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
- extent_for_each_ptr_filter(_e, _ptr, ((_ca) = PTR_DEV(_c, _ptr)))
-
#define extent_ptr_prev(_e, _ptr) \
({ \
typeof(&(_e).v->start->ptr) _p; \
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index b0dc1c142c58..f1125a32239f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1183,33 +1183,13 @@ static int bch_sync_fs(struct super_block *sb, int wait)
return bch_journal_flush(&c->journal);
}
-static struct bch_fs *bch_bdev_to_fs(struct block_device *bdev)
-{
- struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
-
- rcu_read_lock();
-
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i)
- if (ca->disk_sb.bdev == bdev) {
- rcu_read_unlock();
- return c;
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name,
- struct bch_opts opts)
+ struct bch_opts opts)
{
size_t nr_devs = 0, i = 0;
char *dev_name, *s, **devs;
struct bch_fs *c = NULL;
- const char *err;
+ const char *err = "cannot allocate memory";
dev_name = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
@@ -1235,40 +1215,40 @@ static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name,
* filesystem and they all belong to the _same_ filesystem
*/
- mutex_lock(&bch_register_lock);
-
for (i = 0; i < nr_devs; i++) {
struct block_device *bdev = lookup_bdev(devs[i]);
struct bch_fs *c2;
if (IS_ERR(bdev))
- goto err_unlock;
+ goto err;
c2 = bch_bdev_to_fs(bdev);
bdput(bdev);
if (!c)
c = c2;
+ else if (c2)
+ closure_put(&c2->cl);
- if (c != c2)
- goto err_unlock;
+ if (!c)
+ goto err;
+ if (c != c2) {
+ closure_put(&c->cl);
+ goto err;
+ }
}
- if (!c)
- goto err_unlock;
-
mutex_lock(&c->state_lock);
if (!bch_fs_running(c)) {
mutex_unlock(&c->state_lock);
+ closure_put(&c->cl);
err = "incomplete filesystem";
c = NULL;
- goto err_unlock;
+ goto err;
}
- closure_get(&c->cl);
mutex_unlock(&c->state_lock);
- mutex_unlock(&bch_register_lock);
}
set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
@@ -1276,11 +1256,9 @@ err:
kfree(devs);
kfree(dev_name);
+ if (!c)
+ pr_err("bch_fs_open err %s", err);
return c;
-err_unlock:
- mutex_unlock(&bch_register_lock);
- pr_err("bch_fs_open err %s", err);
- goto err;
}
static int bch_remount(struct super_block *sb, int *flags, char *data)
@@ -1398,21 +1376,17 @@ static struct dentry *bch_mount(struct file_system_type *fs_type,
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i) {
+ for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
- BUILD_BUG_ON(sizeof(sb->s_id) < BDEVNAME_SIZE);
-
- bdevname(bdev, sb->s_id);
-
- /* XXX: do we even need s_bdev? */
+ /* XXX: create an anonymous device for multi device filesystems */
sb->s_bdev = bdev;
sb->s_dev = bdev->bd_dev;
+ percpu_ref_put(&ca->io_ref);
break;
}
- rcu_read_unlock();
if (opts.posix_acl < 0)
sb->s_flags |= MS_POSIXACL;
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 9a2f9c1c683b..fbcc40427f23 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -146,14 +146,9 @@ void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
wbio->c = c;
extent_for_each_ptr(e, ptr) {
- rcu_read_lock();
- ca = PTR_DEV(c, ptr);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
- if (!ca) {
- bch_submit_wbio(c, wbio, ca, ptr, punt);
+ ca = c->devs[ptr->dev];
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ bch_submit_wbio(c, wbio, NULL, ptr, punt);
break;
}
@@ -365,7 +360,7 @@ static void bch_write_endio(struct bio *bio)
bch_account_io_completion_time(ca, wbio->submit_time_us,
REQ_OP_WRITE);
if (ca)
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
if (bio->bi_error && orig)
orig->bi_error = bio->bi_error;
@@ -992,7 +987,7 @@ static void bch_rbio_done(struct bch_fs *c, struct bch_read_bio *rbio)
{
struct bio *orig = &bch_rbio_parent(rbio)->bio;
- percpu_ref_put(&rbio->ca->ref);
+ percpu_ref_put(&rbio->ca->io_ref);
rbio->ca = NULL;
if (rbio->split) {
@@ -1034,7 +1029,7 @@ static void bch_read_error_maybe_retry(struct bch_fs *c,
bch_rbio_done(c, rbio);
return;
retry:
- percpu_ref_put(&rbio->ca->ref);
+ percpu_ref_put(&rbio->ca->io_ref);
rbio->ca = NULL;
spin_lock_irqsave(&c->read_retry_lock, flags);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 01c2b92f064f..109c27c88be5 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -897,6 +897,7 @@ search_done:
break;
out:
free_pages((unsigned long) buf.data, get_order(buf.size));
+ percpu_ref_put(&ca->io_ref);
closure_return(cl);
err:
mutex_lock(&jlist->lock);
@@ -974,11 +975,13 @@ int bch_journal_read(struct bch_fs *c, struct list_head *list)
jlist.head = list;
jlist.ret = 0;
- for_each_member_device(ca, c, iter)
+ for_each_readable_member(ca, c, iter) {
+ percpu_ref_get(&ca->io_ref);
closure_call(&ca->journal.read,
bch_journal_read_device,
system_unbound_wq,
&jlist.cl);
+ }
closure_sync(&jlist.cl);
@@ -1285,8 +1288,8 @@ static int journal_entry_sectors(struct journal *j)
lockdep_assert_held(&j->lock);
- rcu_read_lock();
- group_for_each_dev_rcu(ca, &j->devs, i) {
+ spin_lock(&j->devs.lock);
+ group_for_each_dev(ca, &j->devs, i) {
unsigned buckets_required = 0;
sectors_available = min_t(unsigned, sectors_available,
@@ -1317,7 +1320,7 @@ static int journal_entry_sectors(struct journal *j)
nr_devs++;
nr_online++;
}
- rcu_read_unlock();
+ spin_unlock(&j->devs.lock);
if (nr_online < c->opts.metadata_replicas_required)
return -EROFS;
@@ -1881,8 +1884,9 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
bool ret;
spin_lock(&j->lock);
- ret = (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ ret = ja->nr &&
+ (ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
spin_unlock(&j->lock);
return ret;
@@ -1922,9 +1926,12 @@ static void journal_reclaim_work(struct work_struct *work)
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
- group_for_each_dev(ca, &j->devs, iter) {
+ for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
+ if (!ja->nr)
+ continue;
+
while (should_discard_bucket(j, ja)) {
if (!reclaim_lock_held) {
/*
@@ -2012,7 +2019,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
- rcu_read_lock();
/*
* Drop any pointers to devices that have been removed, are no longer
@@ -2023,13 +2029,15 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* entry - that's why we drop pointers to devices <= current free space,
* i.e. whichever device was limiting the current journal entry size.
*/
- extent_for_each_ptr_backwards(e, ptr)
- if (!(ca = PTR_DEV(c, ptr)) ||
- ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
+ extent_for_each_ptr_backwards(e, ptr) {
+ ca = c->devs[ptr->dev];
+
+ if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
__bch_extent_drop_ptr(e, ptr);
else
ca->journal.sectors_free -= sectors;
+ }
replicas = bch_extent_nr_ptrs(e.c);
@@ -2051,8 +2059,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* Pick devices for next journal write:
* XXX: sort devices by free journal space?
*/
- for (i = 0; i < j->devs.nr; i++) {
- ca = j->devs.d[i].dev;
+ group_for_each_dev(ca, &j->devs, i) {
ja = &ca->journal;
if (replicas >= replicas_want)
@@ -2082,7 +2089,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
}
spin_unlock(&j->devs.lock);
- rcu_read_unlock();
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
@@ -2148,7 +2154,7 @@ static void journal_write_endio(struct bio *bio)
bch_journal_halt(j);
closure_put(&j->io);
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
}
static void journal_write_done(struct closure *cl)
@@ -2253,13 +2259,8 @@ static void journal_write(struct closure *cl)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
- rcu_read_lock();
- ca = PTR_DEV(c, ptr);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
- if (!ca) {
+ ca = c->devs[ptr->dev];
+ if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
@@ -2284,11 +2285,10 @@ static void journal_write(struct closure *cl)
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
}
- for_each_member_device(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- journal_flushes_device(ca) &&
+ for_each_rw_member(ca, c, i)
+ if (journal_flushes_device(ca) &&
!bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
- percpu_ref_get(&ca->ref);
+ percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio);
@@ -2631,7 +2631,8 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- group_for_each_dev_rcu(ca, &j->devs, iter) {
+ spin_lock(&j->devs.lock);
+ group_for_each_dev(ca, &j->devs, iter) {
struct journal_device *ja = &ca->journal;
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
@@ -2643,6 +2644,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
+ spin_unlock(&j->devs.lock);
spin_unlock(&j->lock);
rcu_read_unlock();
@@ -2748,19 +2750,24 @@ void bch_fs_journal_stop(struct journal *j)
void bch_dev_journal_exit(struct bch_dev *ca)
{
+ kfree(ca->journal.bio);
kfree(ca->journal.buckets);
kfree(ca->journal.bucket_seq);
+
+ ca->journal.bio = NULL;
+ ca->journal.buckets = NULL;
+ ca->journal.bucket_seq = NULL;
}
-int bch_dev_journal_init(struct bch_dev *ca)
+int bch_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(ca->disk_sb.sb);
+ bch_sb_get_journal(sb);
unsigned i, journal_entry_pages;
journal_entry_pages =
- DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+ DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb),
PAGE_SECTORS);
ja->nr = bch_nr_journal_buckets(journal_buckets);
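One recurring journal change above: device walks move from the lock-free group iterators to explicit member-state iterators, taking io_ref wherever I/O is actually issued. For example, reading journal entries (lightly condensed from the hunk above; which members for_each_readable_member visits, and what reference the iterator itself already holds, are assumptions from context):

	for_each_readable_member(ca, c, iter) {
		percpu_ref_get(&ca->io_ref);	/* held across the async read */
		closure_call(&ca->journal.read, bch_journal_read_device,
			     system_unbound_wq, &jlist.cl);
	}

bch_journal_read_device() then drops the ref via percpu_ref_put(&ca->io_ref) at its out: label, as shown earlier in this file's diff.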
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 96f0b764837b..c83f81046f47 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -366,7 +366,7 @@ int bch_journal_move(struct bch_dev *);
void bch_fs_journal_stop(struct journal *);
void bch_dev_journal_exit(struct bch_dev *);
-int bch_dev_journal_init(struct bch_dev *);
+int bch_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch_fs_journal_exit(struct journal *);
int bch_fs_journal_init(struct journal *, unsigned);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index ce6defe5bda1..5bd93be2fddf 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -64,7 +64,7 @@ int bch_move_data_off_device(struct bch_dev *ca)
u64 seen_key_count;
int ret = 0;
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
if (!ca->mi.has_data)
return 0;
@@ -163,7 +163,7 @@ static int bch_move_btree_off(struct bch_dev *ca, enum btree_id id)
struct btree *b;
int ret;
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
closure_init_stack(&cl);
@@ -259,7 +259,7 @@ int bch_move_metadata_off_device(struct bch_dev *ca)
unsigned i;
int ret;
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
if (!ca->mi.has_metadata)
return 0;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 25b203a142ce..a9a9d3197b6d 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -17,12 +17,7 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
struct bch_extent_ptr ptr)
{
struct bch_extent_ptr *ptr2;
- struct bch_member_rcu *mi;
- unsigned bucket_bits;
-
- mi = fs_member_info_get(c);
- bucket_bits = ilog2(mi->m[ptr.dev].bucket_size);
- fs_member_info_put();
+ unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits;
extent_for_each_ptr(e, ptr2)
if (ptr2->dev == ptr.dev &&
diff --git a/fs/bcachefs/notify.c b/fs/bcachefs/notify.c
index 675dc26cd9ef..1d5f626fcf5d 100644
--- a/fs/bcachefs/notify.c
+++ b/fs/bcachefs/notify.c
@@ -31,11 +31,10 @@ static void notify_get(struct bch_fs *c)
static void notify_get_cache(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- char buf[BDEVNAME_SIZE];
notify_get(c);
notify_var(c, "UUID=%pU", ca->uuid.b);
- notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf));
+ notify_var(c, "BLOCKDEV=%s", ca->name);
}
static void notify_put(struct bch_fs *c)
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 23302d44a8bb..41780d594af1 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -50,7 +50,7 @@ const char * const bch_cache_modes[] = {
};
const char * const bch_dev_state[] = {
- "active",
+ "readwrite",
"readonly",
"failed",
"spare",
diff --git a/fs/bcachefs/request.c b/fs/bcachefs/request.c
index 2b9e687e742b..0646346e4667 100644
--- a/fs/bcachefs/request.c
+++ b/fs/bcachefs/request.c
@@ -712,14 +712,7 @@ static int cached_dev_congested(void *data, int bits)
return 1;
if (cached_dev_get(dc)) {
- unsigned i;
- struct bch_dev *ca;
-
- for_each_member_device(ca, d->c, i) {
- q = bdev_get_queue(ca->disk_sb.bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
- }
-
+ ret |= bch_congested(d->c, bits);
cached_dev_put(dc);
}
@@ -802,17 +795,8 @@ static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode,
static int blockdev_volume_congested(void *data, int bits)
{
struct bcache_device *d = data;
- struct request_queue *q;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
- for_each_member_device(ca, d->c, i) {
- q = bdev_get_queue(ca->disk_sb.bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
- }
-
- return ret;
+ return bch_congested(d->c, bits);
}
void bch_blockdev_volume_request_init(struct bcache_device *d)
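
[Both congestion callbacks above collapse into calls to the new bch_congested() helper in super.c instead of each open-coding a loop over member devices. A compressed sketch of that deduplication, using simplified stand-in types; struct fs, fs_congested() and the state enum are illustrative, not the kernel structures.]

    #include <stdbool.h>
    #include <stddef.h>

    enum state { STATE_RW, STATE_RO, STATE_FAILED };

    struct dev {
            enum state      state;
            bool            congested;
    };

    #define NR_DEVS 4

    struct fs {
            struct dev *devs[NR_DEVS];
    };

    /* One shared helper replacing the per-caller device loops: */
    static int fs_congested(struct fs *c, int bits)
    {
            int ret = 0;
            unsigned i;

            (void) bits;    /* the kernel helper distinguishes sync reads here */

            for (i = 0; i < NR_DEVS; i++) {
                    struct dev *d = c->devs[i];

                    if (d && d->state != STATE_FAILED)
                            ret |= d->congested;
            }
            return ret;
    }

    /* Both callbacks now just delegate: */
    static int cached_dev_congested(void *data, int bits)
    {
            return fs_congested(data, bits);
    }

    static int blockdev_volume_congested(void *data, int bits)
    {
            return fs_congested(data, bits);
    }

    int main(void)
    {
            struct dev d = { STATE_RW, true };
            struct fs c = { .devs = { &d } };

            return !(cached_dev_congested(&c, 0) &&
                     blockdev_volume_congested(&c, 0));
    }
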
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index fd635e64f096..67c03e1932b1 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -174,7 +174,9 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *c,
if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
return NULL;
- for_each_member_device(ca, c, i) {
+ /* XXX: we're not checking that offline devices have enough space */
+
+ for_each_online_member(ca, c, i) {
struct bcache_superblock *sb = &ca->disk_sb;
if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
@@ -306,7 +308,7 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *sb_mi;
- struct bch_member_cpu mi;
+ struct bch_member_cpu mi;
const char *err;
u16 block_size;
@@ -408,7 +410,7 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
return err;
sb_mi = bch_sb_get_members(sb);
- mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
+ mi = bch_mi_to_cpu(sb_mi->members + sb->dev_idx);
if (mi.nbuckets > LONG_MAX)
return "Too many buckets";
@@ -434,104 +436,33 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
/* device open: */
-static bool bch_is_open_cache(struct block_device *bdev)
-{
- struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
-
- rcu_read_lock();
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i)
- if (ca->disk_sb.bdev == bdev) {
- rcu_read_unlock();
- return true;
- }
- rcu_read_unlock();
- return false;
-}
-
-static bool bch_is_open(struct block_device *bdev)
-{
- bool ret;
-
- mutex_lock(&bch_register_lock);
- ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
- mutex_unlock(&bch_register_lock);
-
- return ret;
-}
-
static const char *bch_blkdev_open(const char *path, fmode_t mode,
void *holder, struct block_device **ret)
{
struct block_device *bdev;
- const char *err;
*ret = NULL;
bdev = blkdev_get_by_path(path, mode, holder);
-
- if (bdev == ERR_PTR(-EBUSY)) {
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev))
- return "device busy";
-
- err = bch_is_open(bdev)
- ? "device already registered"
- : "device busy";
-
- bdput(bdev);
- return err;
- }
+ if (bdev == ERR_PTR(-EBUSY))
+ return "device busy";
if (IS_ERR(bdev))
return "failed to open device";
- bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+ if (mode & FMODE_WRITE)
+ bdev_get_queue(bdev)->backing_dev_info.capabilities
+ |= BDI_CAP_STABLE_WRITES;
*ret = bdev;
return NULL;
}
-/* Update cached mi: */
-int bch_fs_mi_update(struct bch_fs *c, struct bch_member *mi,
- unsigned nr_devices)
-{
- struct bch_member_rcu *new, *old;
- struct bch_dev *ca;
- unsigned i;
-
- lockdep_assert_held(&c->sb_lock);
-
- new = kzalloc(sizeof(struct bch_member_rcu) +
- sizeof(struct bch_member_cpu) * nr_devices,
- GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- new->nr_devices = nr_devices;
-
- for (i = 0; i < nr_devices; i++)
- new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
-
- rcu_read_lock();
- for_each_member_device(ca, c, i)
- ca->mi = new->m[i];
- rcu_read_unlock();
-
- old = rcu_dereference_protected(c->members,
- lockdep_is_held(&c->sb_lock));
-
- rcu_assign_pointer(c->members, new);
- if (old)
- kfree_rcu(old, rcu);
-
- return 0;
-}
-
static void bch_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb;
+ struct bch_sb_field_members *mi = bch_sb_get_members(src);
+ struct bch_dev *ca;
+ unsigned i;
lockdep_assert_held(&c->sb_lock);
@@ -548,6 +479,9 @@ static void bch_sb_update(struct bch_fs *c)
c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
c->sb.time_precision = le32_to_cpu(src->time_precision);
+
+ for_each_member_device(ca, c, i)
+ ca->mi = bch_mi_to_cpu(mi->members + i);
}
/* doesn't copy member info */
@@ -586,8 +520,6 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
- struct bch_sb_field_members *members =
- bch_sb_get_members(src);
struct bch_sb_field_journal *journal_buckets =
bch_sb_get_journal(src);
unsigned journal_u64s = journal_buckets
@@ -599,9 +531,6 @@ int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
return -ENOMEM;
- if (bch_fs_mi_update(c, members->members, src->nr_devices))
- return -ENOMEM;
-
__copy_super(c->disk_sb, src);
bch_sb_update(c);
@@ -784,7 +713,7 @@ static void write_super_endio(struct bio *bio)
bch_account_io_completion(ca);
closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
}
static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
@@ -795,6 +724,9 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
if (idx >= sb->layout.nr_superblocks)
return false;
+ if (!percpu_ref_tryget(&ca->io_ref))
+ return false;
+
sb->offset = sb->layout.sb_offset[idx];
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
@@ -812,16 +744,12 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch_bio_map(bio, sb);
- percpu_ref_get(&ca->ref);
closure_bio_submit_punt(bio, &c->sb_write, c);
-
return true;
}
void bch_write_super(struct bch_fs *c)
{
- struct bch_sb_field_members *members =
- bch_sb_get_members(c->disk_sb);
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
unsigned i, super_idx = 0;
@@ -833,7 +761,7 @@ void bch_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb->seq, 1);
- for_each_member_device(ca, c, i)
+ for_each_online_member(ca, c, i)
bch_sb_from_fs(c, ca);
if (c->opts.nochanges)
@@ -841,7 +769,7 @@ void bch_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_member_device(ca, c, i)
+ for_each_online_member(ca, c, i)
if (write_one_super(c, ca, super_idx))
wrote = true;
@@ -850,7 +778,6 @@ void bch_write_super(struct bch_fs *c)
} while (wrote);
out:
/* Make new options visible after they're persistent: */
- bch_fs_mi_update(c, members->members, c->sb.nr_devices);
bch_sb_update(c);
}
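
[With bch_fs_mi_update() gone, bch_sb_update() above becomes the single point where the on-disk member records are decoded into each device's cached, CPU-native ca->mi, always under sb_lock, so readers no longer chase an RCU-protected bch_member_rcu table. A reduced sketch of that decode-and-cache step follows; disk_member and member_cpu are trimmed stand-ins for bch_member and bch_member_cpu, and le64toh()/le16toh() are the glibc endian helpers.]

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    struct disk_member {            /* little-endian, as in the superblock */
            uint64_t nbuckets;
            uint16_t bucket_size;
    };

    struct member_cpu {             /* native-endian cached copy */
            uint64_t nbuckets;
            uint16_t bucket_size;
    };

    struct dev {
            struct member_cpu mi;
    };

    static struct member_cpu mi_to_cpu(const struct disk_member *mi)
    {
            return (struct member_cpu) {
                    .nbuckets       = le64toh(mi->nbuckets),
                    .bucket_size    = le16toh(mi->bucket_size),
            };
    }

    /* Called with the superblock lock held, after every superblock update: */
    static void sb_update(struct dev **devs, const struct disk_member *members,
                          unsigned nr_devices)
    {
            unsigned i;

            for (i = 0; i < nr_devices; i++)
                    if (devs[i])
                            devs[i]->mi = mi_to_cpu(&members[i]);
    }

    int main(void)
    {
            struct disk_member m = { htole64(1024), htole16(8) };
            struct dev d, *devs[1] = { &d };

            sb_update(devs, &m, 1);
            printf("%llu buckets\n", (unsigned long long) d.mi.nbuckets);
            return 0;
    }

[Because every member device is now allocated up front with the filesystem, the loop can update ca->mi in place; the old code had to publish a whole new kfree_rcu'd array instead.]
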
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index ed0338cf22a6..1a9bd3092e4c 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -83,7 +83,7 @@ static inline __u64 bset_magic(struct bch_fs *c)
return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC);
}
-static inline struct bch_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi)
+static inline struct bch_member_cpu bch_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
@@ -99,8 +99,6 @@ static inline struct bch_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi)
};
}
-int bch_fs_mi_update(struct bch_fs *, struct bch_member *, unsigned);
-
int bch_sb_to_fs(struct bch_fs *, struct bch_sb *);
int bch_sb_from_fs(struct bch_fs *, struct bch_dev *);
@@ -118,27 +116,23 @@ void bch_write_super(struct bch_fs *);
void bch_check_mark_super_slowpath(struct bch_fs *,
const struct bkey_i *, bool);
-#define fs_member_info_get(_c) \
- (rcu_read_lock(), rcu_dereference((_c)->members))
-
-#define fs_member_info_put() rcu_read_unlock()
-
static inline bool bch_check_super_marked(struct bch_fs *c,
const struct bkey_i *k, bool meta)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
- struct bch_member_cpu *mi = fs_member_info_get(c)->m;
unsigned nr_replicas = 0;
bool ret = true;
extent_for_each_ptr(e, ptr) {
+ struct bch_dev *ca = c->devs[ptr->dev];
+
if (ptr->cached)
continue;
if (!(meta
- ? mi[ptr->dev].has_metadata
- : mi[ptr->dev].has_data)) {
+ ? ca->mi.has_metadata
+ : ca->mi.has_data)) {
ret = false;
break;
}
@@ -150,8 +144,6 @@ static inline bool bch_check_super_marked(struct bch_fs *c,
(meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
ret = false;
- fs_member_info_put();
-
return ret;
}
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 19c139418790..200b2b31eba0 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -62,28 +62,77 @@ static const uuid_le invalid_uuid = {
};
static struct kset *bcache_kset;
-struct mutex bch_register_lock;
-LIST_HEAD(bch_fs_list);
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
struct workqueue_struct *bcache_io_wq;
struct crypto_shash *bch_sha256;
static void bch_dev_free(struct bch_dev *);
-static int bch_dev_online(struct bch_dev *);
+static int bch_dev_alloc(struct bch_fs *, unsigned);
+static int bch_dev_sysfs_online(struct bch_dev *);
+static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *);
-static int bch_congested_fn(void *data, int bdi_bits)
+struct bch_fs *bch_bdev_to_fs(struct block_device *bdev)
+{
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ rcu_read_lock();
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ for_each_member_device_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ closure_get(&c->cl);
+ goto found;
+ }
+ c = NULL;
+found:
+ rcu_read_unlock();
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
+static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid)
+{
+ struct bch_fs *c;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
+struct bch_fs *bch_uuid_to_fs(uuid_le uuid)
+{
+ struct bch_fs *c;
+
+ mutex_lock(&bch_fs_list_lock);
+ c = __bch_uuid_to_fs(uuid);
+ if (c)
+ closure_get(&c->cl);
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
+int bch_congested(struct bch_fs *c, int bdi_bits)
{
struct backing_dev_info *bdi;
- struct bch_fs *c = data;
struct bch_dev *ca;
unsigned i;
int ret = 0;
- rcu_read_lock();
if (bdi_bits & (1 << WB_sync_congested)) {
/* Reads - check all devices: */
- for_each_member_device_rcu(ca, c, i) {
+ for_each_readable_member(ca, c, i) {
bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
if (bdi_congested(bdi, bdi_bits)) {
@@ -96,7 +145,8 @@ static int bch_congested_fn(void *data, int bdi_bits)
struct bch_tier *tier = READ_ONCE(c->fastest_tier);
struct dev_group *grp = tier ? &tier->devs : &c->all_devs;
- group_for_each_dev_rcu(ca, grp, i) {
+ rcu_read_lock();
+ group_for_each_dev(ca, grp, i) {
bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
if (bdi_congested(bdi, bdi_bits)) {
@@ -104,12 +154,19 @@ static int bch_congested_fn(void *data, int bdi_bits)
break;
}
}
+ rcu_read_unlock();
}
- rcu_read_unlock();
return ret;
}
+static int bch_congested_fn(void *data, int bdi_bits)
+{
+ struct bch_fs *c = data;
+
+ return bch_congested(c, bdi_bits);
+}
+
/* Filesystem RO/RW: */
/*
@@ -256,10 +313,9 @@ const char *bch_fs_read_write(struct bch_fs *c)
goto out;
err = "error starting allocator thread";
- for_each_member_device(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
+ for_each_rw_member(ca, c, i)
+ if (bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -268,10 +324,9 @@ const char *bch_fs_read_write(struct bch_fs *c)
goto err;
err = "error starting moving GC thread";
- for_each_member_device(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_moving_gc_start(ca)) {
- percpu_ref_put(&ca->ref);
+ for_each_rw_member(ca, c, i)
+ if (bch_moving_gc_start(ca)) {
+ percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -324,7 +379,6 @@ static void bch_fs_free(struct bch_fs *c)
if (c->wq)
destroy_workqueue(c->wq);
- kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
kfree(c);
module_put(THIS_MODULE);
@@ -353,17 +407,19 @@ static void bch_fs_offline(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_fs_list_lock);
list_del(&c->list);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
+
+ for_each_member_device(ca, c, i)
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
+ "bcache");
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
- for_each_member_device(ca, c, i)
- if (ca->kobj.state_in_sysfs)
- kobject_del(&ca->kobj);
-
bch_fs_debug_exit(c);
bch_fs_chardev_exit(c);
@@ -453,7 +509,6 @@ void bch_fs_stop(struct bch_fs *c)
closure_sync(&c->cl);
bch_fs_exit(c);
- kobject_put(&c->kobj);
}
/* Stop, detaching from backing devices: */
@@ -468,8 +523,9 @@ void bch_fs_detach(struct bch_fs *c)
static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
+ struct bch_sb_field_members *mi;
struct bch_fs *c;
- unsigned iter_size, journal_entry_bytes;
+ unsigned i, iter_size, journal_entry_bytes;
c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
if (!c)
@@ -607,6 +663,12 @@ static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->bdi.congested_fn = bch_congested_fn;
c->bdi.congested_data = c;
+ mi = bch_sb_get_members(c->disk_sb);
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
+ bch_dev_alloc(c, i))
+ goto err;
+
/*
* Now that all allocations have succeeded, init various refcounty
* things that let us shutdown:
@@ -632,31 +694,19 @@ err:
return NULL;
}
-static struct bch_fs *bch_fs_lookup(uuid_le uuid)
-{
- struct bch_fs *c;
-
- lockdep_assert_held(&bch_register_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
- return c;
-
- return NULL;
-}
-
static const char *__bch_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
+ const char *err = NULL;
unsigned i;
int ret;
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&bch_fs_list_lock);
if (!list_empty(&c->list))
return NULL;
- if (bch_fs_lookup(c->sb.uuid))
+ if (__bch_uuid_to_fs(c->sb.uuid))
return "filesystem UUID already open";
ret = bch_fs_chardev_init(c);
@@ -672,35 +722,33 @@ static const char *__bch_fs_online(struct bch_fs *c)
bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
return "error creating sysfs objects";
- for_each_member_device(ca, c, i)
- if (bch_dev_online(ca)) {
- percpu_ref_put(&ca->ref);
- return "error creating sysfs objects";
- }
-
mutex_lock(&c->state_lock);
- if (bch_blockdev_volumes_start(c)) {
- mutex_unlock(&c->state_lock);
- return "can't bring up blockdev volumes";
- }
+ err = "error creating sysfs objects";
+ __for_each_member_device(ca, c, i)
+ if (bch_dev_sysfs_online(ca))
+ goto err;
- bch_attach_backing_devs(c);
+ err = "can't bring up blockdev volumes";
+ if (bch_blockdev_volumes_start(c))
+ goto err;
- mutex_unlock(&c->state_lock);
+ bch_attach_backing_devs(c);
list_add(&c->list, &bch_fs_list);
-
- return 0;
+ err = NULL;
+err:
+ mutex_unlock(&c->state_lock);
+ return err;
}
static const char *bch_fs_online(struct bch_fs *c)
{
const char *err;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_fs_list_lock);
err = __bch_fs_online(c);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
return err;
}
@@ -719,7 +767,7 @@ static const char *__bch_fs_start(struct bch_fs *c)
BUG_ON(c->state != BCH_FS_STARTING);
mutex_lock(&c->sb_lock);
- for_each_member_device(ca, c, i)
+ for_each_online_member(ca, c, i)
bch_sb_from_fs(c, ca);
mutex_unlock(&c->sb_lock);
@@ -728,27 +776,20 @@ static const char *__bch_fs_start(struct bch_fs *c)
if (ret)
goto err;
- pr_debug("btree_journal_read() done");
-
j = &list_entry(journal.prev, struct journal_replay, list)->j;
+ c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
+ c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+
err = "error reading priorities";
- for_each_member_device(ca, c, i) {
+ for_each_readable_member(ca, c, i) {
ret = bch_prio_read(ca);
if (ret) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
goto err;
}
}
- c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
- c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
-
- for_each_member_device(ca, c, i) {
- bch_recalc_min_prio(ca, READ);
- bch_recalc_min_prio(ca, WRITE);
- }
-
for (id = 0; id < BTREE_ID_NR; id++) {
unsigned level;
struct bkey_i *k;
@@ -786,10 +827,9 @@ static const char *__bch_fs_start(struct bch_fs *c)
bch_journal_start(c);
err = "error starting allocator thread";
- for_each_member_device(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
+ for_each_rw_member(ca, c, i)
+ if (bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -824,9 +864,9 @@ static const char *__bch_fs_start(struct bch_fs *c)
bch_initial_gc(c, NULL);
err = "unable to allocate journal buckets";
- for_each_member_device(ca, c, i)
+ for_each_rw_member(ca, c, i)
if (bch_dev_journal_alloc(ca)) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -838,10 +878,9 @@ static const char *__bch_fs_start(struct bch_fs *c)
bch_journal_set_replay_done(&c->journal);
err = "error starting allocator thread";
- for_each_member_device(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
+ for_each_rw_member(ca, c, i)
+ if (bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -888,10 +927,8 @@ recovery_done:
mi = bch_sb_get_members(c->disk_sb);
now = ktime_get_seconds();
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i)
+ for_each_member_device(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
- rcu_read_unlock();
SET_BCH_SB_INITIALIZED(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb, false);
@@ -991,30 +1028,27 @@ void bch_dev_release(struct kobject *kobj)
static void bch_dev_free(struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
unsigned i;
cancel_work_sync(&ca->io_error_work);
- if (c && c->kobj.state_in_sysfs) {
- char buf[12];
-
- sprintf(buf, "cache%u", ca->dev_idx);
- sysfs_remove_link(&c->kobj, buf);
- }
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
+ "bcache");
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
bch_free_super(&ca->disk_sb);
bch_dev_journal_exit(ca);
+
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->usage_percpu);
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
kfree(ca->prio_buckets);
kfree(ca->bio_prio);
- kfree(ca->journal.bio);
vfree(ca->buckets);
vfree(ca->oldest_gens);
free_heap(&ca->heap);
@@ -1023,46 +1057,47 @@ static void bch_dev_free(struct bch_dev *ca)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
+ percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
-
- if (c)
- kobject_put(&c->kobj);
}
-static void bch_dev_free_work(struct work_struct *work)
+static void bch_dev_io_ref_release(struct percpu_ref *ref)
{
- struct bch_dev *ca = container_of(work, struct bch_dev, free_work);
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
- bch_dev_free(ca);
+ complete(&ca->offline_complete);
}
-static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
+static void bch_dev_offline(struct bch_dev *ca)
{
- struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
+ struct bch_fs *c = ca->fs;
+
+ lockdep_assert_held(&c->state_lock);
+
+ __bch_dev_read_only(ca->fs, ca);
+
+ reinit_completion(&ca->offline_complete);
+ percpu_ref_kill(&ca->io_ref);
+ wait_for_completion(&ca->offline_complete);
+
+ if (ca->kobj.state_in_sysfs) {
+ struct kobject *block =
+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
- schedule_work(&ca->free_work);
+ sysfs_remove_link(block, "bcache");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+
+ bch_free_super(&ca->disk_sb);
+ bch_dev_journal_exit(ca);
}
-static void bch_dev_free_rcu(struct rcu_head *rcu)
+static void bch_dev_ref_release(struct percpu_ref *ref)
{
- struct bch_dev *ca = container_of(rcu, struct bch_dev, free_rcu);
-
- /*
- * This decrements the ref count to ca, and once the ref count
- * is 0 (outstanding bios to the ca also incremented it and
- * decrement it on completion/error), bch_dev_percpu_ref_release
- * is called, and that eventually results in bch_dev_free_work
- * being called, which in turn results in bch_dev_release being
- * called.
- *
- * In particular, these functions won't be called until there are no
- * bios outstanding (the per-cpu ref counts are all 0), so it
- * is safe to remove the actual sysfs device at that point,
- * and that can indicate success to the user.
- */
+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
- percpu_ref_kill(&ca->ref);
+ complete(&ca->stop_complete);
}
static void bch_dev_stop(struct bch_dev *ca)
@@ -1074,26 +1109,44 @@ static void bch_dev_stop(struct bch_dev *ca)
BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
- call_rcu(&ca->free_rcu, bch_dev_free_rcu);
+ synchronize_rcu();
+
+ reinit_completion(&ca->stop_complete);
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->stop_complete);
}
-static int bch_dev_online(struct bch_dev *ca)
+static int bch_dev_sysfs_online(struct bch_dev *ca)
{
- char buf[12];
+ struct bch_fs *c = ca->fs;
+ int ret;
- sprintf(buf, "cache%u", ca->dev_idx);
+ if (!c->kobj.state_in_sysfs)
+ return 0;
+
+ if (!ca->kobj.state_in_sysfs) {
+ ret = kobject_add(&ca->kobj, &ca->fs->kobj,
+ "dev-%u", ca->dev_idx);
+ if (ret)
+ return ret;
+ }
- if (kobject_add(&ca->kobj,
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcache") ||
- sysfs_create_link(&ca->kobj, &ca->fs->kobj, "set") ||
- sysfs_create_link(&ca->fs->kobj, &ca->kobj, buf))
- return -1;
+ if (ca->disk_sb.bdev) {
+ struct kobject *block =
+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
+
+ ret = sysfs_create_link(block, &ca->kobj, "bcache");
+ if (ret)
+ return ret;
+ ret = sysfs_create_link(&ca->kobj, block, "block");
+ if (ret)
+ return ret;
+ }
return 0;
}
-static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb)
+static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member *member;
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
@@ -1102,47 +1155,37 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb)
struct bch_dev *ca;
if (bch_fs_init_fault("dev_alloc"))
- return NULL;
+ return -ENOMEM;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- return NULL;
-
- if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release,
- 0, GFP_KERNEL)) {
- kfree(ca);
- return NULL;
- }
+ return -ENOMEM;
kobject_init(&ca->kobj, &bch_dev_ktype);
+ init_completion(&ca->stop_complete);
+ init_completion(&ca->offline_complete);
spin_lock_init(&ca->self.lock);
ca->self.nr = 1;
rcu_assign_pointer(ca->self.d[0].dev, ca);
- ca->dev_idx = sb->sb->dev_idx;
+ ca->dev_idx = dev_idx;
- INIT_WORK(&ca->free_work, bch_dev_free_work);
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->heap_lock);
bch_dev_moving_gc_init(ca);
- ca->disk_sb = *sb;
- if (sb->mode & FMODE_EXCL)
- ca->disk_sb.bdev->bd_holder = ca;
- memset(sb, 0, sizeof(*sb));
-
INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
if (bch_fs_init_fault("dev_alloc"))
goto err;
- member = bch_sb_get_members(ca->disk_sb.sb)->members +
- ca->disk_sb.sb->dev_idx;
+ member = bch_sb_get_members(c->disk_sb)->members + dev_idx;
- ca->mi = cache_mi_to_cpu_mi(member);
+ ca->mi = bch_mi_to_cpu(member);
ca->uuid = member->uuid;
ca->bucket_bits = ilog2(ca->mi.bucket_size);
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
/* XXX: tune these */
movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
@@ -1155,7 +1198,11 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb)
free_inc_reserve = movinggc_reserve / 2;
heap_size = movinggc_reserve * 8;
- if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+ if (percpu_ref_init(&ca->ref, bch_dev_ref_release,
+ 0, GFP_KERNEL) ||
+ percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC],
movinggc_reserve, GFP_KERNEL) ||
@@ -1166,15 +1213,14 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb)
ca->mi.nbuckets)) ||
!(ca->buckets = vzalloc(sizeof(struct bucket) *
ca->mi.nbuckets)) ||
- !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+ !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
!(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
- !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
- bch_dev_journal_init(ca))
+ !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -1182,75 +1228,76 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb)
total_reserve = ca->free_inc.size;
for (i = 0; i < RESERVE_NR; i++)
total_reserve += ca->free[i].size;
- pr_debug("%zu buckets reserved", total_reserve);
ca->copygc_write_point.group = &ca->self;
ca->tiering_write_point.group = &ca->self;
- return ca;
+ ca->fs = c;
+ rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+
+ if (bch_dev_sysfs_online(ca))
+ pr_warn("error creating sysfs objects");
+
+ return 0;
err:
bch_dev_free(ca);
- return NULL;
+ return -ENOMEM;
}
-static const char *__bch_dev_add(struct bch_fs *c, struct bch_dev *ca)
+static int bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
{
- if (c->devs[ca->dev_idx])
- return "already have device online in this slot";
+ struct bch_dev *ca;
+ int ret;
- if (c->sb.nr_devices == 1)
- bdevname(ca->disk_sb.bdev, c->name);
+ lockdep_assert_held(&c->sb_lock);
+
+ if (le64_to_cpu(sb->sb->seq) >
+ le64_to_cpu(c->disk_sb->seq))
+ bch_sb_to_fs(c, sb->sb);
+
+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+ !c->devs[sb->sb->dev_idx]);
+
+ ca = c->devs[sb->sb->dev_idx];
+ if (ca->disk_sb.bdev) {
+ bch_err(c, "already have device online in slot %u",
+ sb->sb->dev_idx);
+ return -EINVAL;
+ }
+
+ ret = bch_dev_journal_init(ca, sb->sb);
+ if (ret)
+ return ret;
/*
* Increase journal write timeout if flushes to this device are
* expensive:
*/
- if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
+ if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
journal_flushes_device(ca))
c->journal.write_delay_ms =
max(c->journal.write_delay_ms, 1000U);
- kobject_get(&c->kobj);
- ca->fs = c;
+ /* Commit: */
+ ca->disk_sb = *sb;
+ if (sb->mode & FMODE_EXCL)
+ ca->disk_sb.bdev->bd_holder = ca;
+ memset(sb, 0, sizeof(*sb));
- kobject_get(&ca->kobj);
- rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+ if (c->sb.nr_devices == 1)
+ bdevname(ca->disk_sb.bdev, c->name);
+ bdevname(ca->disk_sb.bdev, ca->name);
- if (c->kobj.state_in_sysfs &&
- bch_dev_online(ca))
+ if (bch_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- return NULL;
-}
+ lg_local_lock(&c->usage_lock);
+ if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
+ bch_mark_dev_metadata(ca->fs, ca);
+ lg_local_unlock(&c->usage_lock);
-static const char *bch_dev_alloc(struct bcache_superblock *sb,
- struct bch_fs *c,
- struct bch_dev **ret)
-{
- struct bch_dev *ca;
- const char *err;
-
- ca = __bch_dev_alloc(sb);
- if (!ca)
- return "cannot allocate memory";
-
- err = __bch_dev_add(c, ca);
- if (err) {
- bch_dev_free(ca);
- return err;
- }
-
- mutex_lock(&c->sb_lock);
- if (le64_to_cpu(ca->disk_sb.sb->seq) >
- le64_to_cpu(c->disk_sb->seq))
- bch_sb_to_fs(c, ca->disk_sb.sb);
- mutex_unlock(&c->sb_lock);
-
- if (ret)
- *ret = ca;
- else
- kobject_put(&ca->kobj);
- return NULL;
+ percpu_ref_reinit(&ca->io_ref);
+ return 0;
}
/* Device management: */
@@ -1304,7 +1351,7 @@ bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
{
lockdep_assert_held(&c->state_lock);
- if (new_state == BCH_MEMBER_STATE_ACTIVE)
+ if (new_state == BCH_MEMBER_STATE_RW)
return true;
if (ca->mi.has_data &&
@@ -1346,8 +1393,7 @@ static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
- return NULL;
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
trace_bcache_cache_read_write(ca);
@@ -1370,7 +1416,6 @@ int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_sb_field_members *mi;
- char buf[BDEVNAME_SIZE];
if (ca->mi.state == new_state)
return 0;
@@ -1378,16 +1423,14 @@ int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (!bch_dev_state_allowed(c, ca, new_state, flags))
return -EINVAL;
- if (new_state == BCH_MEMBER_STATE_ACTIVE) {
+ if (new_state == BCH_MEMBER_STATE_RW) {
if (__bch_dev_read_write(c, ca))
return -ENOMEM;
} else {
__bch_dev_read_only(c, ca);
}
- bch_notice(c, "%s %s",
- bdevname(ca->disk_sb.bdev, buf),
- bch_dev_state[new_state]);
+ bch_notice(ca, "%s", bch_dev_state[new_state]);
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
@@ -1448,20 +1491,17 @@ int bch_dev_migrate_from(struct bch_fs *c, struct bch_dev *ca)
static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
- char name[BDEVNAME_SIZE];
unsigned dev_idx = ca->dev_idx;
int ret;
- bdevname(ca->disk_sb.bdev, name);
-
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
- bch_err(ca->fs, "Cannot remove RW device");
+ if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+ bch_err(ca, "Cannot remove RW device");
bch_notify_dev_remove_failed(ca);
return -EINVAL;
}
if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
- bch_err(ca->fs, "Cannot remove %s without losing data", name);
+ bch_err(ca, "Cannot remove without losing data");
bch_notify_dev_remove_failed(ca);
return -EINVAL;
}
@@ -1473,7 +1513,12 @@ static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
ret = bch_flag_data_bad(ca);
if (ret) {
- bch_err(c, "Remove of %s failed", name);
+ bch_err(ca, "Remove failed");
+ return ret;
+ }
+
+ if (ca->mi.has_data || ca->mi.has_metadata) {
+ bch_err(ca, "Can't remove, still has data");
+ return -EBUSY;
}
@@ -1489,13 +1534,9 @@ static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
bch_journal_meta(&c->journal);
+ bch_dev_offline(ca);
bch_dev_stop(ca);
-
- /*
- * RCU barrier between dropping between c->dev and dropping from
- * member info:
- */
- synchronize_rcu();
+ bch_dev_free(ca);
/*
* Free this device's slot in the bch_member array - all pointers to
@@ -1517,6 +1558,7 @@ int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
int ret;
mutex_lock(&c->state_lock);
+ percpu_ref_put(&ca->ref);
ret = __bch_dev_remove(c, ca, flags);
mutex_unlock(&c->state_lock);
@@ -1556,18 +1598,9 @@ int bch_dev_add(struct bch_fs *c, const char *path)
saved_mi = dev_mi->members[sb.sb->dev_idx];
saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
- /*
- * XXX: ditch the GC stuff, just don't remove a device until nothing is
- * using its dev_idx anymore
- */
- down_read(&c->gc_lock);
-
if (dynamic_fault("bcache:add:no_slot"))
goto no_slot;
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- goto no_slot;
-
mi = bch_sb_get_members(c->disk_sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
if (dev_idx >= c->sb.nr_devices ||
@@ -1575,15 +1608,11 @@ int bch_dev_add(struct bch_fs *c, const char *path)
sizeof(uuid_le)))
goto have_slot;
no_slot:
- up_read(&c->gc_lock);
-
err = "no slots available in superblock";
ret = -ENOSPC;
goto err_unlock;
have_slot:
- up_read(&c->gc_lock);
-
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
@@ -1604,53 +1633,44 @@ have_slot:
sb.sb->dev_idx = dev_idx;
sb.sb->nr_devices = nr_devices;
- if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) {
- err = "cannot allocate memory";
- ret = -ENOMEM;
- goto err_unlock;
- }
-
/* commit new member info */
memcpy(mi, dev_mi, u64s * sizeof(u64));
c->disk_sb->nr_devices = nr_devices;
c->sb.nr_devices = nr_devices;
- ca = __bch_dev_alloc(&sb);
- if (!ca) {
+ if (bch_dev_alloc(c, dev_idx)) {
err = "cannot allocate memory";
ret = -ENOMEM;
goto err_unlock;
}
- bch_dev_mark_superblocks(ca);
-
- err = "journal alloc failed";
- if (bch_dev_journal_alloc(ca))
+ if (bch_dev_online(c, &sb)) {
+ err = "bch_dev_online() error";
+ ret = -ENOMEM;
goto err_unlock;
-
- err = __bch_dev_add(c, ca);
- BUG_ON(err);
+ }
bch_write_super(c);
mutex_unlock(&c->sb_lock);
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
+ ca = c->devs[dev_idx];
+ if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+ err = "journal alloc failed";
+ if (bch_dev_journal_alloc(ca))
+ goto err;
+
err = __bch_dev_read_write(c, ca);
if (err)
goto err;
}
bch_notify_dev_added(ca);
-
- kobject_put(&ca->kobj);
mutex_unlock(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
err:
mutex_unlock(&c->state_lock);
- if (ca)
- bch_dev_stop(ca);
bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
@@ -1708,11 +1728,14 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
if (!c)
goto err;
- for (i = 0; i < nr_devices; i++) {
- err = bch_dev_alloc(&sb[i], c, NULL);
- if (err)
+ err = "bch_dev_online() error";
+ mutex_lock(&c->sb_lock);
+ for (i = 0; i < nr_devices; i++)
+ if (bch_dev_online(c, &sb[i])) {
+ mutex_unlock(&c->sb_lock);
goto err;
- }
+ }
+ mutex_unlock(&c->sb_lock);
err = "insufficient devices";
if (!bch_fs_may_start(c, 0))
@@ -1760,8 +1783,8 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
return err;
- mutex_lock(&bch_register_lock);
- c = bch_fs_lookup(sb->sb->uuid);
+ mutex_lock(&bch_fs_list_lock);
+ c = __bch_uuid_to_fs(sb->sb->uuid);
if (c) {
closure_get(&c->cl);
@@ -1777,9 +1800,14 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
allocated_fs = true;
}
- err = bch_dev_alloc(sb, c, NULL);
- if (err)
+ err = "bch_dev_online() error";
+
+ mutex_lock(&c->sb_lock);
+ if (bch_dev_online(c, sb)) {
+ mutex_unlock(&c->sb_lock);
goto err;
+ }
+ mutex_unlock(&c->sb_lock);
if (!c->opts.nostart && bch_fs_may_start(c, 0)) {
err = __bch_fs_start(c);
@@ -1792,11 +1820,11 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
goto err;
closure_put(&c->cl);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
return NULL;
err:
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
if (allocated_fs)
bch_fs_stop(c);
@@ -1817,9 +1845,9 @@ const char *bch_fs_open_incremental(const char *path)
return err;
if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_fs_list_lock);
err = bch_backing_dev_register(&sb);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
} else {
err = __bch_fs_open_incremental(&sb, opts);
}
@@ -1878,7 +1906,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
code == SYS_POWER_OFF) {
struct bch_fs *c;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&bch_fs_list_lock);
if (!list_empty(&bch_fs_list))
pr_info("Setting all devices read only:");
@@ -1889,7 +1917,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
list_for_each_entry(c, &bch_fs_list, list)
bch_fs_read_only(c);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&bch_fs_list_lock);
}
return NOTIFY_DONE;
@@ -1933,7 +1961,6 @@ static int __init bcache_init(void)
NULL
};
- mutex_init(&bch_register_lock);
register_reboot_notifier(&reboot);
bkey_pack_test();
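
[The super.c changes above split the old single refcount in two: ca->ref pins struct bch_dev itself and lives as long as the filesystem, while ca->io_ref is only alive while the device is online, killed in bch_dev_offline() and re-armed with percpu_ref_reinit() in bch_dev_online(). Both teardown paths now kill the ref and block on a completion rather than chaining RCU callbacks. A userspace model of that kill-and-wait lifecycle follows; struct ref and its helpers model percpu_ref plus struct completion, not the real primitives.]

    #include <pthread.h>
    #include <stdbool.h>

    struct ref {
            pthread_mutex_t lock;
            pthread_cond_t  zero;
            unsigned        count;
            bool            dying;
    };

    #define REF_INIT \
            { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, false }

    static bool ref_tryget(struct ref *r)
    {
            bool ok;

            pthread_mutex_lock(&r->lock);
            ok = !r->dying;
            if (ok)
                    r->count++;
            pthread_mutex_unlock(&r->lock);
            return ok;
    }

    static void ref_put(struct ref *r)
    {
            pthread_mutex_lock(&r->lock);
            if (!--r->count && r->dying)
                    pthread_cond_signal(&r->zero);
            pthread_mutex_unlock(&r->lock);
    }

    /* percpu_ref_kill() plus wait_for_completion(), rolled into one: */
    static void ref_kill_and_wait(struct ref *r)
    {
            pthread_mutex_lock(&r->lock);
            r->dying = true;
            while (r->count)
                    pthread_cond_wait(&r->zero, &r->lock);
            pthread_mutex_unlock(&r->lock);
    }

    struct dev {
            struct ref ref;         /* pins the struct for the fs lifetime */
            struct ref io_ref;      /* alive only while the device is online */
    };

    static void dev_offline(struct dev *d)
    {
            ref_kill_and_wait(&d->io_ref);  /* no I/O beyond this point */
            /* ...close the block device, free journal state... */
    }

    static void dev_stop(struct dev *d)
    {
            /* ...clear c->devs[idx], synchronize_rcu()... */
            ref_kill_and_wait(&d->ref);     /* struct may now be freed */
    }

    int main(void)
    {
            struct dev d = { REF_INIT, REF_INIT };

            if (ref_tryget(&d.io_ref))
                    ref_put(&d.io_ref);

            dev_offline(&d);
            dev_stop(&d);
            return 0;
    }
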
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 13fb0e6b42e3..53026cb73696 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -20,42 +20,79 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return s & (ca->mi.bucket_size - 1);
}
-static inline struct bch_dev *bch_next_cache_rcu(struct bch_fs *c,
- unsigned *iter)
+static inline struct bch_dev *__bch_next_dev(struct bch_fs *c, unsigned *iter)
{
- struct bch_dev *ret = NULL;
+ struct bch_dev *ca = NULL;
while (*iter < c->sb.nr_devices &&
- !(ret = rcu_dereference(c->devs[*iter])))
+ !(ca = rcu_dereference_check(c->devs[*iter],
+ lockdep_is_held(&c->state_lock))))
(*iter)++;
- return ret;
+ return ca;
}
+#define __for_each_member_device(ca, c, iter) \
+ for ((iter) = 0; ((ca) = __bch_next_dev((c), &(iter))); (iter)++)
+
#define for_each_member_device_rcu(ca, c, iter) \
- for ((iter) = 0; ((ca) = bch_next_cache_rcu((c), &(iter))); (iter)++)
+ __for_each_member_device(ca, c, iter)
-static inline struct bch_dev *bch_get_next_cache(struct bch_fs *c,
- unsigned *iter)
+static inline struct bch_dev *bch_get_next_dev(struct bch_fs *c, unsigned *iter)
{
- struct bch_dev *ret;
+ struct bch_dev *ca;
rcu_read_lock();
- if ((ret = bch_next_cache_rcu(c, iter)))
- percpu_ref_get(&ret->ref);
+ if ((ca = __bch_next_dev(c, iter)))
+ percpu_ref_get(&ca->ref);
rcu_read_unlock();
- return ret;
+ return ca;
}
/*
* If you break early, you must drop your ref on the current device
*/
-#define for_each_member_device(ca, c, iter) \
+#define for_each_member_device(ca, c, iter) \
for ((iter) = 0; \
- (ca = bch_get_next_cache(c, &(iter))); \
+ (ca = bch_get_next_dev(c, &(iter))); \
percpu_ref_put(&ca->ref), (iter)++)
+static inline struct bch_dev *bch_get_next_online_dev(struct bch_fs *c,
+ unsigned *iter,
+ int state_mask)
+{
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ while ((ca = __bch_next_dev(c, iter)) &&
+ (!((1 << ca->mi.state) & state_mask) ||
+ !percpu_ref_tryget(&ca->io_ref)))
+ (*iter)++;
+ rcu_read_unlock();
+
+ return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask) \
+ for ((iter) = 0; \
+ (ca = bch_get_next_online_dev(c, &(iter), state_mask)); \
+ percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
+
+#define for_each_readable_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, \
+ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+
+struct bch_fs *bch_bdev_to_fs(struct block_device *);
+struct bch_fs *bch_uuid_to_fs(uuid_le);
+int bch_congested(struct bch_fs *, int);
+
void bch_dev_release(struct kobject *);
bool bch_dev_state_allowed(struct bch_fs *, struct bch_dev *,
@@ -84,8 +121,6 @@ const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
struct bch_fs **);
const char *bch_fs_open_incremental(const char *path);
-extern struct mutex bch_register_lock;
-extern struct list_head bch_fs_list;
extern struct workqueue_struct *bcache_io_wq;
extern struct crypto_shash *bch_sha256;
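
[The new iterators above encode the device filter as a bitmask of member states, so for_each_rw_member() and for_each_readable_member() are one macro with different masks, and each iteration only yields devices whose io_ref tryget succeeds. A compact, compilable model of the mask-based walk follows; the tryget is elided, and next_dev() and the array types are illustrative.]

    #include <stdio.h>

    enum member_state { MEMBER_STATE_RW, MEMBER_STATE_RO, MEMBER_STATE_FAILED };

    struct dev { enum member_state state; };

    #define NR_DEVS 3

    static struct dev *next_dev(struct dev *devs, unsigned *iter, int state_mask)
    {
            while (*iter < NR_DEVS &&
                   !((1 << devs[*iter].state) & state_mask))
                    (*iter)++;
            return *iter < NR_DEVS ? &devs[*iter] : NULL;
    }

    #define for_each_member(ca, devs, iter, mask)                       \
            for ((iter) = 0;                                            \
                 ((ca) = next_dev((devs), &(iter), (mask)));            \
                 (iter)++)

    #define for_each_rw_member(ca, devs, iter)                          \
            for_each_member(ca, devs, iter, 1 << MEMBER_STATE_RW)

    #define for_each_readable_member(ca, devs, iter)                    \
            for_each_member(ca, devs, iter,                             \
                            (1 << MEMBER_STATE_RW)|(1 << MEMBER_STATE_RO))

    int main(void)
    {
            struct dev devs[NR_DEVS] = {
                    { MEMBER_STATE_RW },
                    { MEMBER_STATE_FAILED },
                    { MEMBER_STATE_RO },
            };
            struct dev *ca;
            unsigned i;

            for_each_readable_member(ca, devs, i)
                    printf("dev %u readable\n", i);
            return 0;
    }
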
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 14675c2b721f..91897671b52d 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -206,12 +206,10 @@ SHOW(bch_cached_dev)
return 0;
}
-STORE(__cached_dev)
+STORE(bch_cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
- unsigned v = size;
- struct bch_fs *c;
struct kobj_uevent_env *env;
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
@@ -228,6 +226,13 @@ STORE(__cached_dev)
d_strtoi_h(sequential_cutoff);
d_strtoi_h(readahead);
+ if (attr == &sysfs_writeback_running)
+ bch_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+
if (attr == &sysfs_clear_stats)
bch_cache_accounting_clear(&dc->accounting);
@@ -295,17 +300,25 @@ STORE(__cached_dev)
}
if (attr == &sysfs_attach) {
- if (uuid_parse(buf, &dc->disk_sb.sb->user_uuid))
+ struct bch_fs *c;
+ uuid_le uuid;
+ int ret;
+
+ if (uuid_parse(buf, &uuid))
return -EINVAL;
- list_for_each_entry(c, &bch_fs_list, list) {
- v = bch_cached_dev_attach(dc, c);
- if (!v)
- return size;
+ c = bch_uuid_to_fs(uuid);
+ if (!c) {
+ pr_err("Can't attach %s: cache set not found", buf);
+ return -ENOENT;
}
- pr_err("Can't attach %s: cache set not found", buf);
- size = v;
+ dc->disk_sb.sb->set_uuid = uuid;
+
+ ret = bch_cached_dev_attach(dc, c);
+ closure_put(&c->cl);
+ if (ret)
+ return ret;
}
if (attr == &sysfs_detach && dc->disk.c)
@@ -317,25 +330,6 @@ STORE(__cached_dev)
return size;
}
-STORE(bch_cached_dev)
-{
- struct cached_dev *dc = container_of(kobj, struct cached_dev,
- disk.kobj);
-
- mutex_lock(&bch_register_lock);
- size = __cached_dev_store(kobj, attr, buf, size);
-
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
-
- if (attr == &sysfs_writeback_percent)
- schedule_delayed_work(&dc->writeback_pd_update,
- dc->writeback_pd_update_seconds * HZ);
-
- mutex_unlock(&bch_register_lock);
- return size;
-}
-
static struct attribute *bch_cached_dev_files[] = {
&sysfs_attach,
&sysfs_detach,
@@ -380,7 +374,7 @@ SHOW(bch_blockdev_volume)
return 0;
}
-STORE(__bch_blockdev_volume)
+STORE(bch_blockdev_volume)
{
struct bcache_device *d = container_of(kobj, struct bcache_device,
kobj);
@@ -438,7 +432,6 @@ STORE(__bch_blockdev_volume)
return size;
}
-STORE_LOCKED(bch_blockdev_volume)
static struct attribute *bch_blockdev_volume_files[] = {
&sysfs_unregister,
@@ -1224,7 +1217,7 @@ SHOW(bch_dev)
return 0;
}
-STORE(__bch_dev)
+STORE(bch_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
@@ -1300,7 +1293,6 @@ STORE(__bch_dev)
return size;
}
-STORE_LOCKED(bch_dev)
static struct attribute *bch_dev_files[] = {
&sysfs_uuid,
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
index 9d5845874931..02700246acaf 100644
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
@@ -21,16 +21,6 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
const char *buf, size_t size) \
-#define STORE_LOCKED(fn) \
-STORE(fn) \
-{ \
- ssize_t ret; \
- mutex_lock(&bch_register_lock); \
- ret = __ ## fn ## _store(kobj, attr, buf, size); \
- mutex_unlock(&bch_register_lock); \
- return ret; \
-}
-
#define __sysfs_attribute(_name, _mode) \
static struct attribute sysfs_##_name = \
{ .name = #_name, .mode = _mode }
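
[With bch_register_lock gone, the STORE_LOCKED() wrapper above is deleted and sysfs store handlers run without a global mutex; the attach path in sysfs.c instead resolves the filesystem by UUID through bch_uuid_to_fs(), which returns it with a reference held that the caller drops when finished. A small model of that lookup-with-reference pattern follows; uuid_to_fs() and fs_put() stand in for bch_uuid_to_fs() and closure_put().]

    #include <stdio.h>
    #include <string.h>

    struct fs {
            unsigned char   uuid[16];
            unsigned        refs;           /* models the closure refcount */
    };

    /* Returns the matching fs with a reference held, or NULL: */
    static struct fs *uuid_to_fs(struct fs *all, unsigned nr,
                                 const unsigned char *uuid)
    {
            unsigned i;

            for (i = 0; i < nr; i++)
                    if (!memcmp(all[i].uuid, uuid, 16)) {
                            all[i].refs++;  /* closure_get() */
                            return &all[i];
                    }
            return NULL;
    }

    static void fs_put(struct fs *c)
    {
            c->refs--;                      /* closure_put() */
    }

    static int attach_store(struct fs *all, unsigned nr,
                            const unsigned char *uuid)
    {
            struct fs *c = uuid_to_fs(all, nr, uuid);

            if (!c) {
                    fprintf(stderr, "Can't attach: cache set not found\n");
                    return -1;
            }

            /* ...bch_cached_dev_attach(dc, c)... */
            fs_put(c);
            return 0;
    }

    int main(void)
    {
            struct fs set[1] = { { { 0xab }, 0 } };
            unsigned char uuid[16] = { 0xab };

            return attach_store(set, 1, uuid);
    }
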
diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c
index 1d6e06519483..b1ac13c99275 100644
--- a/fs/bcachefs/tier.c
+++ b/fs/bcachefs/tier.c
@@ -30,7 +30,6 @@ static bool tiering_pred(struct bch_fs *c,
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
- struct bch_member_rcu *mi;
unsigned replicas = 0;
/* Make sure we have room to add a new pointer: */
@@ -38,12 +37,9 @@ static bool tiering_pred(struct bch_fs *c,
BKEY_EXTENT_VAL_U64s_MAX)
return false;
- mi = fs_member_info_get(c);
extent_for_each_ptr(e, ptr)
- if (ptr->dev < mi->nr_devices &&
- mi->m[ptr->dev].tier >= s->tier->idx)
+ if (c->devs[ptr->dev]->mi.tier >= s->tier->idx)
replicas++;
- fs_member_info_put();
return replicas < c->opts.data_replicas;
}
@@ -54,7 +50,7 @@ static bool tiering_pred(struct bch_fs *c,
static void tier_put_device(struct tiering_state *s)
{
if (s->ca)
- percpu_ref_put(&s->ca->ref);
+ percpu_ref_put(&s->ca->io_ref);
s->ca = NULL;
}
@@ -74,7 +70,7 @@ static void tier_next_device(struct bch_fs *c, struct tiering_state *s)
if (s->tier->devs.nr) {
s->ca = s->tier->devs.d[s->dev_idx].dev;
- percpu_ref_get(&s->ca->ref);
+ percpu_ref_get(&s->ca->io_ref);
}
spin_unlock(&s->tier->devs.lock);
}
@@ -183,19 +179,19 @@ static int bch_tiering_thread(void *arg)
last = atomic_long_read(&clock->now);
tier_capacity = available_sectors = 0;
- rcu_read_lock();
for (faster_tier = c->tiers;
faster_tier != tier;
faster_tier++) {
- group_for_each_dev_rcu(ca, &faster_tier->devs, i) {
+ spin_lock(&faster_tier->devs.lock);
+ group_for_each_dev(ca, &faster_tier->devs, i) {
tier_capacity +=
(ca->mi.nbuckets -
ca->mi.first_bucket) << ca->bucket_bits;
available_sectors +=
dev_buckets_available(ca) << ca->bucket_bits;
}
+ spin_unlock(&faster_tier->devs.lock);
}
- rcu_read_unlock();
if (available_sectors < (tier_capacity >> 1))
break;