author | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-10 07:08:39 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-17 19:49:23 -0800 |
commit | a5b2efedf8485ee4a36c736cf6cfe907c0db91c5 (patch) | |
tree | 8dc1073413af469ad03da7132a5d7354aa40dc85 | |
parent | de9690db2991d5d3a1f88211e9ef46c3b5a5dae4 (diff) | |
bcachefs: Rework struct bch_dev lifetime
Allocate all member devices when allocating struct bch_fs, not when they
come online - this will let us handle running in degraded mode better,
and ends up simplifying things a good bit.
33 files changed, 686 insertions(+), 930 deletions(-)
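The pattern that recurs throughout the diff below: because every member device is now allocated together with `struct bch_fs`, lookups no longer need `rcu_read_lock()` plus a NULL check (`PTR_DEV()`) — `c->devs[ptr->dev]` is always valid. What used to hang off a single `ca->ref` splits into two refcounts: `ref` for the object's existence (now tied to the filesystem's lifetime) and the new `io_ref` for "device is online", which I/O paths take with `percpu_ref_tryget()`. A minimal sketch of that rule, with simplified types and a hypothetical `do_io_to_dev()` helper that is not taken from the patch:

```c
#include <linux/percpu-refcount.h>

#define BCH_SB_MEMBERS_MAX 64	/* value assumed for this sketch */

struct bch_dev {
	struct percpu_ref ref;		/* existence: lives as long as the bch_fs */
	struct percpu_ref io_ref;	/* online: killed when the device goes offline */
	/* ... */
};

struct bch_fs {
	/* allocated up front for every member device, online or not: */
	struct bch_dev *devs[BCH_SB_MEMBERS_MAX];
};

static int do_io_to_dev(struct bch_fs *c, unsigned dev_idx)
{
	/* Always safe: the slot is populated for the filesystem's whole lifetime */
	struct bch_dev *ca = c->devs[dev_idx];

	/* Only the I/O itself needs the device online; fail softly if it isn't,
	 * which is what lets the filesystem keep running degraded */
	if (!percpu_ref_tryget(&ca->io_ref))
		return -EIO;

	/* ... submit the bio against ca's block device ... */

	percpu_ref_put(&ca->io_ref);
	return 0;
}
```

This is why hunks below replace `percpu_ref_put(&pick.ca->ref)` with `percpu_ref_put(&pick.ca->io_ref)`, guard journal and superblock writes with `percpu_ref_tryget(&ca->io_ref)`, and drop the `rcu_read_lock()`/`extent_for_each_online_device()` dance in favor of plain `extent_for_each_ptr()` loops.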
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 5937c292b7cb..5bd6de9fb05c 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -74,6 +74,7 @@ #include <trace/events/bcachefs.h> static void __bch_bucket_free(struct bch_dev *, struct bucket *); +static void bch_recalc_min_prio(struct bch_dev *, int); /* Allocation groups: */ @@ -84,7 +85,7 @@ void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca) spin_lock(&grp->lock); for (i = 0; i < grp->nr; i++) - if (rcu_access_pointer(grp->d[i].dev) == ca) { + if (grp->d[i].dev == ca) { grp->nr--; memmove(&grp->d[i], &grp->d[i + 1], @@ -101,12 +102,12 @@ void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca) spin_lock(&grp->lock); for (i = 0; i < grp->nr; i++) - if (rcu_access_pointer(grp->d[i].dev) == ca) + if (grp->d[i].dev == ca) goto out; BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - rcu_assign_pointer(grp->d[grp->nr++].dev, ca); + grp->d[grp->nr++].dev = ca; out: spin_unlock(&grp->lock); } @@ -137,7 +138,8 @@ static void pd_controllers_update(struct work_struct *work) faster_tiers_dirty, -1); - group_for_each_dev_rcu(ca, &c->tiers[i].devs, iter) { + spin_lock(&c->tiers[i].devs.lock); + group_for_each_dev(ca, &c->tiers[i].devs, iter) { struct bch_dev_usage stats = bch_dev_usage_read(ca); unsigned bucket_bits = ca->bucket_bits + 9; @@ -172,6 +174,7 @@ static void pd_controllers_update(struct work_struct *work) copygc_can_free += fragmented; } + spin_unlock(&c->tiers[i].devs.lock); } rcu_read_unlock(); @@ -441,8 +444,15 @@ int bch_prio_read(struct bch_dev *ca) bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); } + + mutex_lock(&c->bucket_lock); + bch_recalc_min_prio(ca, READ); + bch_recalc_min_prio(ca, WRITE); + mutex_unlock(&c->bucket_lock); + + ret = 0; fsck_err: - return 0; + return ret; } #define BUCKET_GC_GEN_MAX 96U @@ -520,6 +530,8 @@ void bch_recalc_min_prio(struct bch_dev *ca, int rw) u16 max_delta = 1; unsigned i; + lockdep_assert_held(&c->bucket_lock); + /* Determine min prio for this particular cache */ for_each_bucket(g, ca) max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); @@ -821,8 +833,8 @@ static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) spin_lock(&ca->freelist_lock); bch_mark_alloc_bucket(ca, g, true); - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; + g->read_prio = c->prio_clock[READ].hand; + g->write_prio = c->prio_clock[WRITE].hand; verify_not_on_freelist(ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); @@ -1058,7 +1070,6 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c, if (ob->nr_ptrs >= nr_replicas) return ALLOC_SUCCESS; - rcu_read_lock(); spin_lock(&devs->lock); for (i = 0; i < devs->nr; i++) @@ -1128,7 +1139,6 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c, err: EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); spin_unlock(&devs->lock); - rcu_read_unlock(); return ret; } @@ -1223,14 +1233,14 @@ static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { const struct bch_extent_ptr *ptr; - struct bch_dev *ca; lockdep_assert_held(&c->open_buckets_lock); - rcu_read_lock(); - open_bucket_for_each_online_device(c, ob, ptr, ca) + open_bucket_for_each_ptr(ob, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); - rcu_read_unlock(); + } ob->nr_ptrs = 0; @@ -1283,12 +1293,13 @@ 
static struct open_bucket *bch_open_bucket_get(struct bch_fs *c, return ret; } -static unsigned ob_ptr_sectors_free(struct open_bucket *ob, - struct bch_member_rcu *mi, +static unsigned ob_ptr_sectors_free(struct bch_fs *c, + struct open_bucket *ob, struct bch_extent_ptr *ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; unsigned i = ptr - ob->ptrs; - unsigned bucket_size = mi->m[ptr->dev].bucket_size; + unsigned bucket_size = ca->mi.bucket_size; unsigned used = (ptr->offset & (bucket_size - 1)) + ob->ptr_offset[i]; @@ -1301,14 +1312,11 @@ static unsigned open_bucket_sectors_free(struct bch_fs *c, struct open_bucket *ob, unsigned nr_replicas) { - struct bch_member_rcu *mi = fs_member_info_get(c); unsigned i, sectors_free = UINT_MAX; for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) sectors_free = min(sectors_free, - ob_ptr_sectors_free(ob, mi, &ob->ptrs[i])); - - fs_member_info_put(); + ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); return sectors_free != UINT_MAX ? sectors_free : 0; } @@ -1317,11 +1325,10 @@ static void open_bucket_copy_unused_ptrs(struct bch_fs *c, struct open_bucket *new, struct open_bucket *old) { - struct bch_member_rcu *mi = fs_member_info_get(c); unsigned i; for (i = 0; i < old->nr_ptrs; i++) - if (ob_ptr_sectors_free(old, mi, &old->ptrs[i])) { + if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { struct bch_extent_ptr tmp = old->ptrs[i]; tmp.offset += old->ptr_offset[i]; @@ -1329,19 +1336,18 @@ static void open_bucket_copy_unused_ptrs(struct bch_fs *c, new->ptr_offset[new->nr_ptrs] = 0; new->nr_ptrs++; } - fs_member_info_put(); } static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) { #ifdef CONFIG_BCACHEFS_DEBUG const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - rcu_read_lock(); - open_bucket_for_each_online_device(c, ob, ptr, ca) + open_bucket_for_each_ptr(ob, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + BUG_ON(ptr_stale(ca, ptr)); - rcu_read_unlock(); + } #endif } @@ -1485,7 +1491,6 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, unsigned sectors) { struct bch_extent_ptr tmp; - struct bch_dev *ca; bool has_data = false; unsigned i; @@ -1500,8 +1505,6 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, if (nr_replicas < ob->nr_ptrs) has_data = true; - rcu_read_lock(); - for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); @@ -1512,11 +1515,8 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, ob->ptr_offset[i] += sectors; - if ((ca = PTR_DEV(c, &ob->ptrs[i]))) - this_cpu_add(*ca->sectors_written, sectors); + this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors); } - - rcu_read_unlock(); } /* @@ -1526,19 +1526,16 @@ void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_member_rcu *mi = fs_member_info_get(c); bool has_data = false; unsigned i; for (i = 0; i < ob->nr_ptrs; i++) { - if (!ob_ptr_sectors_free(ob, mi, &ob->ptrs[i])) + if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) ob->has_full_ptrs = true; else has_data = true; } - fs_member_info_put(); - if (likely(has_data)) atomic_inc(&ob->pin); else @@ -1600,8 +1597,7 @@ void bch_recalc_capacity(struct bch_fs *c) unsigned long ra_pages = 0; unsigned i, j; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i) { + for_each_online_member(ca, c, i) { struct backing_dev_info *bdi = 
blk_get_backing_dev_info(ca->disk_sb.bdev); @@ -1632,7 +1628,8 @@ void bch_recalc_capacity(struct bch_fs *c) * Capacity of the filesystem is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. */ - group_for_each_dev_rcu(ca, &slowest_tier->devs, i) { + spin_lock(&slowest_tier->devs.lock); + group_for_each_dev(ca, &slowest_tier->devs, i) { size_t reserve = 0; /* @@ -1668,8 +1665,8 @@ void bch_recalc_capacity(struct bch_fs *c) ca->mi.first_bucket) << ca->bucket_bits; } + spin_unlock(&slowest_tier->devs.lock); set_capacity: - rcu_read_unlock(); total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1828,6 +1825,8 @@ int bch_dev_allocator_start(struct bch_dev *ca) { struct bch_fs *c = ca->fs; struct dev_group *tier = &c->tiers[ca->mi.tier].devs; + struct bch_sb_field_journal *journal_buckets; + bool has_journal; struct task_struct *k; /* @@ -1845,7 +1844,15 @@ int bch_dev_allocator_start(struct bch_dev *ca) bch_dev_group_add(tier, ca); bch_dev_group_add(&c->all_devs, ca); - bch_dev_group_add(&c->journal.devs, ca); + + mutex_lock(&c->sb_lock); + journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); + has_journal = bch_nr_journal_buckets(journal_buckets) >= + BCH_JOURNAL_BUCKETS_MIN; + mutex_unlock(&c->sb_lock); + + if (has_journal) + bch_dev_group_add(&c->journal.devs, ca); bch_recalc_capacity(c); diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h index bd50fec8f3c7..f8aa762de2e0 100644 --- a/fs/bcachefs/alloc.h +++ b/fs/bcachefs/alloc.h @@ -25,8 +25,6 @@ void bch_dev_group_add(struct dev_group *, struct bch_dev *); int bch_prio_read(struct bch_dev *); -void bch_recalc_min_prio(struct bch_dev *, int); - size_t bch_bucket_alloc(struct bch_dev *, enum alloc_reserve); void bch_open_bucket_put(struct bch_fs *, struct open_bucket *); @@ -56,54 +54,27 @@ static inline void bch_wake_allocator(struct bch_dev *ca) rcu_read_unlock(); } -static inline struct bch_dev *dev_group_next_rcu(struct dev_group *devs, - unsigned *iter) +static inline struct bch_dev *dev_group_next(struct dev_group *devs, + unsigned *iter) { struct bch_dev *ret = NULL; while (*iter < devs->nr && - !(ret = rcu_dereference(devs->d[*iter].dev))) + !(ret = rcu_dereference_check(devs->d[*iter].dev, + lockdep_is_held(&devs->lock)))) (*iter)++; return ret; } -#define group_for_each_dev_rcu(ca, devs, iter) \ +#define group_for_each_dev(ca, devs, iter) \ for ((iter) = 0; \ - ((ca) = dev_group_next_rcu((devs), &(iter))); \ + ((ca) = dev_group_next((devs), &(iter))); \ (iter)++) -static inline struct bch_dev *dev_group_next(struct dev_group *devs, - unsigned *iter) -{ - struct bch_dev *ret; - - rcu_read_lock(); - if ((ret = dev_group_next_rcu(devs, iter))) - percpu_ref_get(&ret->ref); - rcu_read_unlock(); - - return ret; -} - -#define group_for_each_dev(ca, devs, iter) \ - for ((iter) = 0; \ - (ca = dev_group_next(devs, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) - -#define __open_bucket_next_online_device(_c, _ob, _ptr, _ca) \ -({ \ - (_ca) = NULL; \ - \ - while ((_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs && \ - !((_ca) = PTR_DEV(_c, _ptr))) \ - (_ptr)++; \ - (_ca); \ -}) - -#define open_bucket_for_each_online_device(_c, _ob, _ptr, _ca) \ +#define open_bucket_for_each_ptr(_ob, _ptr) \ for ((_ptr) = (_ob)->ptrs; \ - ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\ + (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ (_ptr)++) void bch_recalc_capacity(struct bch_fs *); diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h index af2cfc5fe77d..dd9e3b8253ff 
100644 --- a/fs/bcachefs/bcache.h +++ b/fs/bcachefs/bcache.h @@ -317,7 +317,8 @@ struct crypto_blkcipher; struct crypto_ahash; enum gc_phase { - GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1, + GC_PHASE_SB_METADATA = BTREE_ID_NR + 1, + GC_PHASE_PENDING_DELETE, GC_PHASE_DONE }; @@ -340,21 +341,15 @@ struct bch_member_cpu { u8 valid; }; -struct bch_member_rcu { - struct rcu_head rcu; - unsigned nr_devices; - struct bch_member_cpu m[]; -}; - struct bch_dev { + struct kobject kobj; struct percpu_ref ref; - struct rcu_head free_rcu; - struct work_struct free_work; + struct percpu_ref io_ref; + struct completion stop_complete; + struct completion offline_complete; struct bch_fs *fs; - struct dev_group self; - u8 dev_idx; /* * Cached version of this device's member info from superblock @@ -362,10 +357,11 @@ struct bch_dev { */ struct bch_member_cpu mi; uuid_le uuid; + char name[BDEVNAME_SIZE]; struct bcache_superblock disk_sb; - struct kobject kobj; + struct dev_group self; /* biosets used in cloned bios for replicas and moving_gc */ struct bio_set replica_set; @@ -517,12 +513,6 @@ struct bch_fs { struct bch_opts opts; - /* - * Cached copy in native endianness: - * Set by bch_fs_mi_update(): - */ - struct bch_member_rcu __rcu *members; - /* Updated by bch_sb_update():*/ struct { uuid_le uuid; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ac3b8b458f44..f4c2f275bf78 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -788,7 +788,7 @@ LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); #endif enum bch_member_state { - BCH_MEMBER_STATE_ACTIVE = 0, + BCH_MEMBER_STATE_RW = 0, BCH_MEMBER_STATE_RO = 1, BCH_MEMBER_STATE_FAILED = 2, BCH_MEMBER_STATE_SPARE = 3, diff --git a/fs/bcachefs/blockdev.c b/fs/bcachefs/blockdev.c index 5da771e1158c..a4522ad2836f 100644 --- a/fs/bcachefs/blockdev.c +++ b/fs/bcachefs/blockdev.c @@ -17,6 +17,8 @@ static int bch_blockdev_major; static DEFINE_IDA(bch_blockdev_minor); static LIST_HEAD(uncached_devices); +static DEFINE_MUTEX(bch_blockdev_lock); + static struct kmem_cache *bch_search_cache; static void write_bdev_super_endio(struct bio *bio) @@ -62,21 +64,6 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } -bool bch_is_open_backing_dev(struct block_device *bdev) -{ - struct bch_fs *c, *tc; - struct cached_dev *dc, *t; - - list_for_each_entry_safe(c, tc, &bch_fs_list, list) - list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->disk_sb.bdev == bdev) - return true; - list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->disk_sb.bdev == bdev) - return true; - return false; -} - static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; @@ -118,8 +105,6 @@ void bch_blockdev_stop(struct bcache_device *d) static void bcache_device_unlink(struct bcache_device *d) { - lockdep_assert_held(&bch_register_lock); - if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { sysfs_remove_link(&d->c->kobj, d->name); sysfs_remove_link(&d->kobj, "cache"); @@ -141,8 +126,6 @@ static void bcache_device_link(struct bcache_device *d, struct bch_fs *c, static void bcache_device_detach(struct bcache_device *d) { - lockdep_assert_held(&bch_register_lock); - if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { mutex_lock(&d->inode_lock); bch_inode_rm(d->c, bcache_dev_inum(d)); @@ -161,8 +144,6 @@ static int bcache_device_attach(struct 
bcache_device *d, struct bch_fs *c) { int ret; - lockdep_assert_held(&bch_register_lock); - ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d); if (ret) { pr_err("radix_tree_insert() error for inum %llu", @@ -178,8 +159,6 @@ static int bcache_device_attach(struct bcache_device *d, struct bch_fs *c) static void bcache_device_free(struct bcache_device *d) { - lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); if (d->c) @@ -325,7 +304,7 @@ static void cached_dev_detach_finish(struct work_struct *w) BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); - mutex_lock(&bch_register_lock); + mutex_lock(&bch_blockdev_lock); memset(&dc->disk_sb.sb->set_uuid, 0, 16); SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE); @@ -339,7 +318,7 @@ static void cached_dev_detach_finish(struct work_struct *w) clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_blockdev_lock); pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf)); @@ -349,8 +328,6 @@ static void cached_dev_detach_finish(struct work_struct *w) void bch_cached_dev_detach(struct cached_dev *dc) { - lockdep_assert_held(&bch_register_lock); - if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; @@ -495,11 +472,14 @@ void bch_attach_backing_devs(struct bch_fs *c) { struct cached_dev *dc, *t; - lockdep_assert_held(&bch_register_lock); lockdep_assert_held(&c->state_lock); + mutex_lock(&bch_blockdev_lock); + list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c); + + mutex_unlock(&bch_blockdev_lock); } void bch_cached_dev_release(struct kobject *kobj) @@ -517,14 +497,14 @@ static void cached_dev_free(struct closure *cl) bch_cached_dev_writeback_stop(dc); bch_cached_dev_writeback_free(dc); - mutex_lock(&bch_register_lock); + mutex_lock(&bch_blockdev_lock); if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk); bcache_device_free(&dc->disk); list_del(&dc->list); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_blockdev_lock); bch_free_super((void *) &dc->disk_sb); @@ -536,11 +516,8 @@ static void cached_dev_flush(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; - mutex_lock(&bch_register_lock); - bcache_device_unlink(d); - mutex_unlock(&bch_register_lock); - bch_cache_accounting_destroy(&dc->accounting); + bcache_device_unlink(d); kobject_del(&d->kobj); continue_at(cl, cached_dev_free, system_wq); @@ -652,8 +629,11 @@ const char *bch_backing_dev_register(struct bcache_superblock *sb) bdevname(dc->disk_sb.bdev, name)); list_add(&dc->list, &uncached_devices); - list_for_each_entry(c, &bch_fs_list, list) + c = bch_uuid_to_fs(dc->disk_sb.sb->set_uuid); + if (c) { bch_cached_dev_attach(dc, c); + closure_put(&c->cl); + } if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE || BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE) @@ -678,9 +658,7 @@ static void blockdev_volume_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); - mutex_lock(&bch_register_lock); bcache_device_free(d); - mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); } @@ -688,9 +666,7 @@ static void blockdev_volume_flush(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); - mutex_lock(&bch_register_lock); bcache_device_unlink(d); - mutex_unlock(&bch_register_lock); 
kobject_del(&d->kobj); continue_at(cl, blockdev_volume_free, system_wq); } @@ -792,7 +768,7 @@ void bch_blockdevs_stop(struct bch_fs *c) struct radix_tree_iter iter; void **slot; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_blockdev_lock); rcu_read_lock(); radix_tree_for_each_slot(slot, &c->devices, &iter, 0) { @@ -808,7 +784,7 @@ void bch_blockdevs_stop(struct bch_fs *c) } rcu_read_unlock(); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_blockdev_lock); } void bch_fs_blockdev_exit(struct bch_fs *c) diff --git a/fs/bcachefs/blockdev.h b/fs/bcachefs/blockdev.h index 0062ef7d1df3..5423d77644f8 100644 --- a/fs/bcachefs/blockdev.h +++ b/fs/bcachefs/blockdev.h @@ -59,7 +59,6 @@ void bch_cached_dev_detach(struct cached_dev *); void bch_cached_dev_run(struct cached_dev *); void bch_blockdev_stop(struct bcache_device *); -bool bch_is_open_backing_dev(struct block_device *); const char *bch_backing_dev_register(struct bcache_superblock *); int bch_blockdev_volume_create(struct bch_fs *, u64); @@ -90,10 +89,6 @@ static inline void bch_cached_dev_detach(struct cached_dev *dc) {} static inline void bch_cached_dev_run(struct cached_dev *dc) {} static inline void bch_blockdev_stop(struct bcache_device *d) {} -static inline bool bch_is_open_backing_dev(struct block_device *bdev) -{ - return false; -} static inline const char *bch_backing_dev_register(struct bcache_superblock *sb) { return "not implemented"; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9c34269736c8..7e8a3f6a17df 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -90,15 +90,13 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) { const struct bch_extent_ptr *ptr; - struct bch_dev *ca; u8 max_stale = 0; if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - rcu_read_lock(); - - extent_for_each_online_device(c, e, ptr, ca) { + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; size_t b = PTR_BUCKET_NR(ca, ptr); if (__gen_after(ca->oldest_gens[b], ptr->gen)) @@ -106,8 +104,6 @@ u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) max_stale = max(max_stale, ptr_stale(ca, ptr)); } - - rcu_read_unlock(); } return max_stale; @@ -254,10 +250,10 @@ static void bch_mark_allocator_buckets(struct bch_fs *c) const struct bch_extent_ptr *ptr; mutex_lock(&ob->lock); - rcu_read_lock(); - open_bucket_for_each_online_device(c, ob, ptr, ca) + open_bucket_for_each_ptr(ob, ptr) { + ca = c->devs[ptr->dev]; bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true); - rcu_read_unlock(); + } mutex_unlock(&ob->lock); } } @@ -273,7 +269,7 @@ static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, } while (b < end >> ca->bucket_bits); } -void bch_dev_mark_superblocks(struct bch_dev *ca) +static void bch_dev_mark_superblocks(struct bch_dev *ca) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; @@ -294,11 +290,13 @@ void bch_dev_mark_superblocks(struct bch_dev *ca) /* * Mark non btree metadata - prios, journal */ -static void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) +void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) { unsigned i; u64 b; + lockdep_assert_held(&c->sb_lock); + bch_dev_mark_superblocks(ca); spin_lock(&c->journal.lock); @@ -329,10 +327,10 @@ static void bch_mark_metadata(struct bch_fs *c) unsigned i; mutex_lock(&c->sb_lock); + gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA)); - 
for_each_member_device(ca, c, i) + for_each_online_member(ca, c, i) bch_mark_dev_metadata(c, ca); - mutex_unlock(&c->sb_lock); } @@ -935,14 +933,14 @@ int bch_initial_gc(struct bch_fs *c, struct list_head *journal) { enum btree_id id; - bch_mark_metadata(c); - for (id = 0; id < BTREE_ID_NR; id++) bch_initial_gc_btree(c, id); if (journal) bch_journal_mark(c, journal); + bch_mark_metadata(c); + /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 590ade2d8211..f1794fdf4378 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -13,7 +13,7 @@ int bch_initial_gc(struct bch_fs *, struct list_head *); u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); u8 bch_btree_mark_key_initial(struct bch_fs *, enum bkey_type, struct bkey_s_c); -void bch_dev_mark_superblocks(struct bch_dev *); +void bch_mark_dev_metadata(struct bch_fs *, struct bch_dev *); /* * For concurrent mark and sweep (with other index updates), we define a total diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d11d72fc9f39..71478fb1cc89 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1226,7 +1226,7 @@ void bch_btree_node_read(struct bch_fs *c, struct btree *b) bch_time_stats_update(&c->btree_read_time, start_time); out: bio_put(bio); - percpu_ref_put(&pick.ca->ref); + percpu_ref_put(&pick.ca->io_ref); } int bch_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -1319,7 +1319,7 @@ static void btree_node_write_endio(struct bio *bio) } if (ca) - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); } void __bch_btree_node_write(struct bch_fs *c, struct btree *b, @@ -1336,7 +1336,6 @@ void __bch_btree_node_write(struct bch_fs *c, struct btree *b, BKEY_PADDED(key) k; struct bkey_s_extent e; struct bch_extent_ptr *ptr; - struct bch_dev *ca; struct sort_iter sort_iter; struct nonce nonce; unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; @@ -1557,10 +1556,9 @@ void __bch_btree_node_write(struct bch_fs *c, struct btree *b, extent_for_each_ptr(e, ptr) ptr->offset += b->written; - rcu_read_lock(); - extent_for_each_online_device(c, e, ptr, ca) - atomic64_add(sectors_to_write, &ca->btree_sectors_written); - rcu_read_unlock(); + extent_for_each_ptr(e, ptr) + atomic64_add(sectors_to_write, + &c->devs[ptr->dev]->btree_sectors_written); b->written += sectors_to_write; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 59c68d493995..8514f5472016 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -450,8 +450,8 @@ static void bch_mark_pointer(struct bch_fs *c, { struct bucket_mark old, new; unsigned saturated; - struct bch_dev *ca; - struct bucket *g; + struct bch_dev *ca = c->devs[ptr->dev]; + struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); u64 v; unsigned old_sectors, new_sectors; int disk_sectors, compressed_sectors; @@ -469,12 +469,6 @@ static void bch_mark_pointer(struct bch_fs *c, compressed_sectors = -__compressed_sectors(crc, old_sectors) + __compressed_sectors(crc, new_sectors); - ca = PTR_DEV(c, ptr); - if (!ca) - goto out; - - g = ca->buckets + PTR_BUCKET_NR(ca, ptr); - if (gc_will_visit) { if (journal_seq) bucket_cmpxchg(g, new, new.journal_seq = journal_seq); @@ -565,13 +559,11 @@ static void bch_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, BUG_ON(metadata && bkey_extent_is_cached(e.k)); BUG_ON(!sectors); - rcu_read_lock(); extent_for_each_ptr_crc(e, ptr, crc) 
bch_mark_pointer(c, e, crc, ptr, sectors, ptr->cached ? S_CACHED : type, may_make_unavailable, stats, gc_will_visit, journal_seq); - rcu_read_unlock(); } static void __bch_mark_key(struct bch_fs *c, struct bkey_s_c k, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d189c72fb8ad..9a00d38a682a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,14 +39,6 @@ static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g) return g->mark.gen - ca->oldest_gens[r]; } -static inline struct bch_dev *PTR_DEV(const struct bch_fs *c, - const struct bch_extent_ptr *ptr) -{ - EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_devices); - - return rcu_dereference(c->devs[ptr->dev]); -} - static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { @@ -64,14 +56,12 @@ static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c, #if 0 if (bkey_extent_is_data(&k->k)) { const struct bch_extent_ptr *ptr; - const struct bch_dev *ca; - rcu_read_lock(); - extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) { + extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) { + const struct bch_dev *ca = c->devs[ptr->dev]; bucket = PTR_BUCKET_NR(ca, ptr); break; } - rcu_read_unlock(); } #endif return bucket; @@ -102,8 +92,6 @@ static inline u8 gen_after(u8 a, u8 b) /** * ptr_stale() - check if a pointer points into a bucket that has been * invalidated. - * - * Warning: PTR_DEV(c, k, ptr) must equal ca. */ static inline u8 ptr_stale(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4b1fd946da32..9ef8cfc64d99 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -124,7 +124,7 @@ static long bch_ioctl_disk_add(struct bch_fs *c, /* returns with ref on ca->ref */ static struct bch_dev *bch_device_lookup(struct bch_fs *c, - const char __user *dev) + const char __user *dev) { struct block_device *bdev; struct bch_dev *ca; @@ -166,7 +166,6 @@ static long bch_ioctl_disk_remove(struct bch_fs *c, ret = bch_dev_remove(c, ca, arg.flags); - percpu_ref_put(&ca->ref); return ret; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bfe73c22109..b91f53d261d2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -88,7 +88,7 @@ void __bch_btree_verify(struct bch_fs *c, struct btree *b) bch_btree_node_read_done(c, v, pick.ca, &pick.ptr); n_sorted = c->verify_data->data; - percpu_ref_put(&pick.ca->ref); + percpu_ref_put(&pick.ca->io_ref); sorted = &n_sorted->keys; inmemory = &n_inmemory->keys; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 48087fba967c..ba46d2d12f59 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -112,7 +112,6 @@ void bch_nonfatal_io_error_work(struct work_struct *work) struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; unsigned errors = atomic_read(&ca->io_errors); - char buf[BDEVNAME_SIZE]; bool dev; if (errors < c->error_limit) { @@ -127,9 +126,8 @@ void bch_nonfatal_io_error_work(struct work_struct *work) ? __bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, BCH_FORCE_IF_DEGRADED) : bch_fs_emergency_read_only(c)) - bch_err(c, - "too many IO errors on %s, setting %s RO", - bdevname(ca->disk_sb.bdev, buf), + bch_err(ca, + "too many IO errors, setting %s RO", dev ? 
"device" : "filesystem"); mutex_unlock(&c->state_lock); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fe8e186ada1a..726b20d4434b 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -13,13 +13,6 @@ struct bch_fs; /* Error messages: */ -#define __bch_dev_error(ca, fmt, ...) \ -do { \ - char _buf[BDEVNAME_SIZE]; \ - bch_err((ca)->fs, "%s: " fmt, \ - bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \ -} while (0) - /* * Very fatal logic/inconsistency errors: these indicate that we've majorly * screwed up at runtime, i.e. it's not likely that it was just caused by the @@ -75,7 +68,7 @@ do { \ #define bch_dev_inconsistent(ca, ...) \ do { \ - __bch_dev_error(ca, __VA_ARGS__); \ + bch_err(ca, __VA_ARGS__); \ bch_inconsistent_error((ca)->fs); \ } while (0) @@ -171,17 +164,15 @@ do { \ #define bch_dev_fatal_error(ca, ...) \ do { \ - __bch_dev_error(ca, __VA_ARGS__); \ + bch_err(ca, __VA_ARGS__); \ bch_fatal_error(c); \ } while (0) #define bch_dev_fatal_io_error(ca, fmt, ...) \ do { \ - char _buf[BDEVNAME_SIZE]; \ - \ printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \ "fatal IO error on %s for " fmt), \ - bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \ + (ca)->name, ##__VA_ARGS__); \ bch_fatal_error((ca)->fs); \ } while (0) @@ -219,11 +210,9 @@ do { \ /* Logs message and handles the error: */ #define bch_dev_nonfatal_io_error(ca, fmt, ...) \ do { \ - char _buf[BDEVNAME_SIZE]; \ - \ printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \ "IO error on %s for " fmt), \ - bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \ + (ca)->name, ##__VA_ARGS__); \ bch_nonfatal_io_error(ca); \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 731dce2ec7d5..87a68d738567 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -322,9 +322,9 @@ static bool should_drop_ptr(const struct bch_fs *c, struct bkey_s_c_extent e, const struct bch_extent_ptr *ptr) { - struct bch_dev *ca; + struct bch_dev *ca = c->devs[ptr->dev]; - return (ca = PTR_DEV(c, ptr)) && ptr_stale(ca, ptr); + return ptr_stale(ca, ptr); } static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) @@ -332,14 +332,12 @@ static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) struct bch_extent_ptr *ptr = &e.v->start->ptr; bool dropped = false; - rcu_read_lock(); while ((ptr = extent_ptr_next(e, ptr))) if (should_drop_ptr(c, e.c, ptr)) { __bch_extent_drop_ptr(e, ptr); dropped = true; } else ptr++; - rcu_read_unlock(); if (dropped) bch_extent_drop_redundant_crcs(e); @@ -387,30 +385,39 @@ static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) } } -static const char *extent_ptr_invalid(struct bkey_s_c_extent e, - const struct bch_member_rcu *mi, +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c_extent e, const struct bch_extent_ptr *ptr, - unsigned size_ondisk) + unsigned size_ondisk, + bool metadata) { const struct bch_extent_ptr *ptr2; - const struct bch_member_cpu *m = mi->m + ptr->dev; + struct bch_dev *ca; + + if (ptr->dev >= c->sb.nr_devices) + return "pointer to invalid device"; - if (ptr->dev > mi->nr_devices || !m->valid) + ca = c->devs[ptr->dev]; + if (!ca) return "pointer to invalid device"; extent_for_each_ptr(e, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) return "multiple pointers to same device"; - if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets) + if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets) return "offset past end of device"; - if (ptr->offset < 
m->bucket_size * m->first_bucket) + if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket) return "offset before first bucket"; - if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size) + if ((ptr->offset & (ca->mi.bucket_size - 1)) + + size_ondisk > ca->mi.bucket_size) return "spans multiple buckets"; + if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data)) + return "device not marked as containing data"; + return NULL; } @@ -426,7 +433,6 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - rcu_read_lock(); extent_for_each_entry(e, entry) { if (!first) p(" "); @@ -445,10 +451,11 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, break; case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); + ca = c->devs[ptr->dev]; p("ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, - (ca = PTR_DEV(c, ptr)) && ptr_stale(ca, ptr) + ca && ptr_stale(ca, ptr) ? " stale" : ""); break; default: @@ -459,8 +466,6 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, first = false; } out: - rcu_read_unlock(); - if (bkey_extent_is_cached(e.k)) p(" cached"); #undef p @@ -487,27 +492,20 @@ static const char *bch_btree_ptr_invalid(const struct bch_fs *c, const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; const union bch_extent_crc *crc; - struct bch_member_rcu *mi; const char *reason; extent_for_each_entry(e, entry) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - mi = fs_member_info_get(c); - extent_for_each_ptr_crc(e, ptr, crc) { - reason = extent_ptr_invalid(e, mi, ptr, - c->sb.btree_node_size); - - if (reason) { - fs_member_info_put(); + reason = extent_ptr_invalid(c, e, ptr, + c->sb.btree_node_size, + true); + if (reason) return reason; - } } - fs_member_info_put(); - if (crc) return "has crc field"; @@ -532,32 +530,26 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, unsigned replicas = 0; bool bad; - rcu_read_lock(); - - extent_for_each_online_device(c, e, ptr, ca) { + extent_for_each_ptr(e, ptr) { + ca = c->devs[ptr->dev]; + g = PTR_BUCKET(ca, ptr); replicas++; - if ((ca = PTR_DEV(c, ptr))) { - g = PTR_BUCKET(ca, ptr); + err = "stale"; + if (ptr_stale(ca, ptr)) + goto err; - err = "stale"; - if (ptr_stale(ca, ptr)) - goto err; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - g->mark.data_type != BUCKET_BTREE; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + g->mark.data_type != BUCKET_BTREE; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - err = "inconsistent"; - if (bad) - goto err; - } + err = "inconsistent"; + if (bad) + goto err; } - rcu_read_unlock(); - if (replicas < c->sb.meta_replicas_have) { bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); @@ -576,7 +568,6 @@ err: g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], (unsigned) g->mark.counter); - rcu_read_unlock(); } static void bch_btree_ptr_to_text(struct bch_fs *c, char *buf, @@ -603,11 +594,9 @@ bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b) const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; struct extent_pick_ptr pick = { .ca = NULL }; - struct bch_dev *ca; - - rcu_read_lock(); - extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + 
extent_for_each_ptr_crc(e, ptr, crc) { + struct bch_dev *ca = c->devs[ptr->dev]; struct btree *root = btree_node_root(c, b); if (bch_fs_inconsistent_on(crc, c, @@ -628,15 +617,16 @@ bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b) if (pick.ca && pick.ca->mi.tier < ca->mi.tier) continue; + if (!percpu_ref_tryget(&ca->io_ref)) + continue; + + if (pick.ca) + percpu_ref_put(&pick.ca->io_ref); + pick.ca = ca; pick.ptr = *ptr; } - if (pick.ca) - percpu_ref_get(&pick.ca->ref); - - rcu_read_unlock(); - return pick; } @@ -1757,47 +1747,38 @@ static const char *bch_extent_invalid(const struct bch_fs *c, const union bch_extent_entry *entry; const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; - struct bch_member_rcu *mi = fs_member_info_get(c); unsigned size_ondisk = e.k->size; const char *reason; extent_for_each_entry(e, entry) { - reason = "invalid extent entry type"; if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - goto invalid; + return "invalid extent entry type"; if (extent_entry_is_crc(entry)) { crc = entry_to_crc(entry); - reason = "checksum offset + key size > uncompressed size"; if (crc_offset(crc) + e.k->size > crc_uncompressed_size(e.k, crc)) - goto invalid; + return "checksum offset + key size > uncompressed size"; size_ondisk = crc_compressed_size(e.k, crc); - reason = "invalid checksum type"; if (!bch_checksum_type_valid(c, crc_csum_type(crc))) - goto invalid; + return "invalid checksum type"; - reason = "invalid compression type"; if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) - goto invalid; + return "invalid compression type"; } else { ptr = entry_to_ptr(entry); - reason = extent_ptr_invalid(e, mi, - &entry->ptr, size_ondisk); + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); if (reason) - goto invalid; + return reason; } } - fs_member_info_put(); return NULL; -invalid: - fs_member_info_put(); - return reason; } case BCH_RESERVATION: { @@ -1821,14 +1802,13 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, struct bkey_s_c_extent e) { const struct bch_extent_ptr *ptr; - struct bch_member_rcu *mi; struct bch_dev *ca; struct bucket *g; unsigned seq, stale; char buf[160]; bool bad; unsigned ptrs_per_tier[BCH_TIER_MAX]; - unsigned tier, replicas = 0; + unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, @@ -1841,13 +1821,11 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); - mi = fs_member_info_get(c); - extent_for_each_ptr(e, ptr) { + ca = c->devs[ptr->dev]; + g = PTR_BUCKET(ca, ptr); replicas++; - - if (ptr->dev >= mi->nr_devices) - goto bad_device; + ptrs_per_tier[ca->mi.tier]++; /* * If journal replay hasn't finished, we might be seeing keys @@ -1856,51 +1834,40 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) continue; - if (!mi->m[ptr->dev].valid) - goto bad_device; - - tier = mi->m[ptr->dev].tier; - ptrs_per_tier[tier]++; - stale = 0; - if ((ca = PTR_DEV(c, ptr))) { - g = PTR_BUCKET(ca, ptr); - - do { - struct bucket_mark mark; + do { + struct bucket_mark mark; - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = READ_ONCE(g->mark); + seq = read_seqcount_begin(&c->gc_pos_lock); + mark = READ_ONCE(g->mark); - /* between mark and bucket gen */ - smp_rmb(); + /* between mark and bucket gen */ + smp_rmb(); - stale = ptr_stale(ca, ptr); + stale = ptr_stale(ca, ptr); - bch_fs_bug_on(stale && 
!ptr->cached, c, - "stale dirty pointer"); + bch_fs_bug_on(stale && !ptr->cached, c, + "stale dirty pointer"); - bch_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); + bch_fs_bug_on(stale > 96, c, + "key too stale: %i", + stale); - if (stale) - break; + if (stale) + break; - bad = (mark.data_type != BUCKET_DATA || - (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !mark.owned_by_allocator && - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors))); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + bad = (mark.data_type != BUCKET_DATA || + (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + !mark.owned_by_allocator && + !(ptr->cached + ? mark.cached_sectors + : mark.dirty_sectors))); + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - if (bad) - goto bad_ptr; - } + if (bad) + goto bad_ptr; } - fs_member_info_put(); if (replicas > BCH_REPLICAS_MAX) { bch_bkey_val_to_text(c, btree_node_type(b), buf, @@ -1923,14 +1890,6 @@ static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; -bad_device: - bch_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch_fs_bug(c, "extent pointer to dev %u missing device: %s", - ptr->dev, buf); - fs_member_info_put(); - return; - bad_ptr: bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); @@ -1940,7 +1899,6 @@ bad_ptr: g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], (unsigned) g->mark.counter); - fs_member_info_put(); return; } @@ -1976,12 +1934,10 @@ static void bch_extent_to_text(struct bch_fs *c, char *buf, #undef p } -static unsigned PTR_TIER(struct bch_member_rcu *mi, +static unsigned PTR_TIER(struct bch_fs *c, const struct bch_extent_ptr *ptr) { - return ptr->dev < mi->nr_devices - ? 
mi->m[ptr->dev].tier - : UINT_MAX; + return c->devs[ptr->dev]->mi.tier; } static void bch_extent_crc_init(union bch_extent_crc *crc, @@ -2136,35 +2092,30 @@ void bch_extent_mark_replicas_cached(struct bch_fs *c, unsigned nr_cached) { struct bch_extent_ptr *ptr; - struct bch_member_rcu *mi; bool have_higher_tier; unsigned tier = 0; if (!nr_cached) return; - mi = fs_member_info_get(c); - do { have_higher_tier = false; extent_for_each_ptr(e, ptr) { if (!ptr->cached && - PTR_TIER(mi, ptr) == tier) { + PTR_TIER(c, ptr) == tier) { ptr->cached = true; nr_cached--; if (!nr_cached) - goto out; + return; } - if (PTR_TIER(mi, ptr) > tier) + if (PTR_TIER(c, ptr) > tier) have_higher_tier = true; } tier++; } while (have_higher_tier); -out: - fs_member_info_put(); } /* @@ -2182,7 +2133,6 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e; const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; - struct bch_dev *ca; switch (k.k->type) { case KEY_TYPE_DELETED: @@ -2198,10 +2148,11 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT: case BCH_EXTENT_CACHED: e = bkey_s_c_to_extent(k); - rcu_read_lock(); ret->ca = NULL; - extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + extent_for_each_ptr_crc(e, ptr, crc) { + struct bch_dev *ca = c->devs[ptr->dev]; + if (ptr_stale(ca, ptr)) continue; @@ -2213,6 +2164,12 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, ret->ca->mi.tier < ca->mi.tier)) continue; + if (!percpu_ref_tryget(&ca->io_ref)) + continue; + + if (ret->ca) + percpu_ref_put(&ret->ca->io_ref); + *ret = (struct extent_pick_ptr) { .crc = crc_to_128(e.k, crc), .ptr = *ptr, @@ -2220,12 +2177,8 @@ void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, }; } - if (ret->ca) - percpu_ref_get(&ret->ca->ref); - else if (!bkey_extent_is_cached(e.k)) + if (!ret->ca && !bkey_extent_is_cached(e.k)) ret->ca = ERR_PTR(-EIO); - - rcu_read_unlock(); return; case BCH_RESERVATION: @@ -2273,7 +2226,7 @@ static enum merge_result bch_extent_merge(struct bch_fs *c, extent_for_each_entry(el, en_l) { struct bch_extent_ptr *lp, *rp; - struct bch_member_cpu *m; + unsigned bucket_size; en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); @@ -2291,15 +2244,11 @@ static enum merge_result bch_extent_merge(struct bch_fs *c, return BCH_MERGE_NOMERGE; /* We don't allow extents to straddle buckets: */ + bucket_size = c->devs[lp->dev]->mi.bucket_size; - m = fs_member_info_get(c)->m + lp->dev; - if ((lp->offset & ~((u64) m->bucket_size - 1)) != - (rp->offset & ~((u64) m->bucket_size - 1))) { - fs_member_info_put(); + if ((lp->offset & ~((u64) bucket_size - 1)) != + (rp->offset & ~((u64) bucket_size - 1))) return BCH_MERGE_NOMERGE; - - } - fs_member_info_put(); } break; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 2d70c42a695b..db7bd4f14988 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -285,10 +285,6 @@ out: \ #define extent_for_each_ptr_crc(_e, _ptr, _crc) \ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) -#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \ - extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \ - ((_ca) = PTR_DEV(_c, _ptr))) - /* Iterate over pointers only, and from a given position: */ #define extent_ptr_next_filter(_e, _ptr, _filter) \ @@ -309,9 +305,6 @@ out: \ #define extent_for_each_ptr(_e, _ptr) \ extent_for_each_ptr_filter(_e, _ptr, true) -#define extent_for_each_online_device(_c, _e, _ptr, _ca) \ - 
extent_for_each_ptr_filter(_e, _ptr, ((_ca) = PTR_DEV(_c, _ptr))) - #define extent_ptr_prev(_e, _ptr) \ ({ \ typeof(&(_e).v->start->ptr) _p; \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b0dc1c142c58..f1125a32239f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1183,33 +1183,13 @@ static int bch_sync_fs(struct super_block *sb, int wait) return bch_journal_flush(&c->journal); } -static struct bch_fs *bch_bdev_to_fs(struct block_device *bdev) -{ - struct bch_fs *c; - struct bch_dev *ca; - unsigned i; - - rcu_read_lock(); - - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i) - if (ca->disk_sb.bdev == bdev) { - rcu_read_unlock(); - return c; - } - - rcu_read_unlock(); - - return NULL; -} - static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name, - struct bch_opts opts) + struct bch_opts opts) { size_t nr_devs = 0, i = 0; char *dev_name, *s, **devs; struct bch_fs *c = NULL; - const char *err; + const char *err = "cannot allocate memory"; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) @@ -1235,40 +1215,40 @@ static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name, * filesystem and they all belong to the _same_ filesystem */ - mutex_lock(&bch_register_lock); - for (i = 0; i < nr_devs; i++) { struct block_device *bdev = lookup_bdev(devs[i]); struct bch_fs *c2; if (IS_ERR(bdev)) - goto err_unlock; + goto err; c2 = bch_bdev_to_fs(bdev); bdput(bdev); if (!c) c = c2; + else if (c2) + closure_put(&c2->cl); - if (c != c2) - goto err_unlock; + if (!c) + goto err; + if (c != c2) { + closure_put(&c->cl); + goto err; + } } - if (!c) - goto err_unlock; - mutex_lock(&c->state_lock); if (!bch_fs_running(c)) { mutex_unlock(&c->state_lock); + closure_put(&c->cl); err = "incomplete filesystem"; c = NULL; - goto err_unlock; + goto err; } - closure_get(&c->cl); mutex_unlock(&c->state_lock); - mutex_unlock(&bch_register_lock); } set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); @@ -1276,11 +1256,9 @@ err: kfree(devs); kfree(dev_name); + if (!c) + pr_err("bch_fs_open err %s", err); return c; -err_unlock: - mutex_unlock(&bch_register_lock); - pr_err("bch_fs_open err %s", err); - goto err; } static int bch_remount(struct super_block *sb, int *flags, char *data) @@ -1398,21 +1376,17 @@ static struct dentry *bch_mount(struct file_system_type *fs_type, sb->s_time_gran = c->sb.time_precision; c->vfs_sb = sb; sb->s_bdi = &c->bdi; + strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i) { + for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; - BUILD_BUG_ON(sizeof(sb->s_id) < BDEVNAME_SIZE); - - bdevname(bdev, sb->s_id); - - /* XXX: do we even need s_bdev? 
*/ + /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; + percpu_ref_put(&ca->io_ref); break; } - rcu_read_unlock(); if (opts.posix_acl < 0) sb->s_flags |= MS_POSIXACL; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9a2f9c1c683b..fbcc40427f23 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -146,14 +146,9 @@ void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, wbio->c = c; extent_for_each_ptr(e, ptr) { - rcu_read_lock(); - ca = PTR_DEV(c, ptr); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - if (!ca) { - bch_submit_wbio(c, wbio, ca, ptr, punt); + ca = c->devs[ptr->dev]; + if (!percpu_ref_tryget(&ca->io_ref)) { + bch_submit_wbio(c, wbio, NULL, ptr, punt); break; } @@ -365,7 +360,7 @@ static void bch_write_endio(struct bio *bio) bch_account_io_completion_time(ca, wbio->submit_time_us, REQ_OP_WRITE); if (ca) - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); if (bio->bi_error && orig) orig->bi_error = bio->bi_error; @@ -992,7 +987,7 @@ static void bch_rbio_done(struct bch_fs *c, struct bch_read_bio *rbio) { struct bio *orig = &bch_rbio_parent(rbio)->bio; - percpu_ref_put(&rbio->ca->ref); + percpu_ref_put(&rbio->ca->io_ref); rbio->ca = NULL; if (rbio->split) { @@ -1034,7 +1029,7 @@ static void bch_read_error_maybe_retry(struct bch_fs *c, bch_rbio_done(c, rbio); return; retry: - percpu_ref_put(&rbio->ca->ref); + percpu_ref_put(&rbio->ca->io_ref); rbio->ca = NULL; spin_lock_irqsave(&c->read_retry_lock, flags); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 01c2b92f064f..109c27c88be5 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -897,6 +897,7 @@ search_done: break; out: free_pages((unsigned long) buf.data, get_order(buf.size)); + percpu_ref_put(&ca->io_ref); closure_return(cl); err: mutex_lock(&jlist->lock); @@ -974,11 +975,13 @@ int bch_journal_read(struct bch_fs *c, struct list_head *list) jlist.head = list; jlist.ret = 0; - for_each_member_device(ca, c, iter) + for_each_readable_member(ca, c, iter) { + percpu_ref_get(&ca->io_ref); closure_call(&ca->journal.read, bch_journal_read_device, system_unbound_wq, &jlist.cl); + } closure_sync(&jlist.cl); @@ -1285,8 +1288,8 @@ static int journal_entry_sectors(struct journal *j) lockdep_assert_held(&j->lock); - rcu_read_lock(); - group_for_each_dev_rcu(ca, &j->devs, i) { + spin_lock(&j->devs.lock); + group_for_each_dev(ca, &j->devs, i) { unsigned buckets_required = 0; sectors_available = min_t(unsigned, sectors_available, @@ -1317,7 +1320,7 @@ static int journal_entry_sectors(struct journal *j) nr_devs++; nr_online++; } - rcu_read_unlock(); + spin_unlock(&j->devs.lock); if (nr_online < c->opts.metadata_replicas_required) return -EROFS; @@ -1881,8 +1884,9 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) bool ret; spin_lock(&j->lock); - ret = (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); + ret = ja->nr && + (ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); spin_unlock(&j->lock); return ret; @@ -1922,9 +1926,12 @@ static void journal_reclaim_work(struct work_struct *work) * Advance last_idx to point to the oldest journal entry containing * btree node updates that have not yet been written out */ - group_for_each_dev(ca, &j->devs, iter) { + for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; + if (!ja->nr) + continue; + while (should_discard_bucket(j, ja)) { if 
(!reclaim_lock_held) { /* @@ -2012,7 +2019,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); - rcu_read_lock(); /* * Drop any pointers to devices that have been removed, are no longer @@ -2023,13 +2029,15 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) * entry - that's why we drop pointers to devices <= current free space, * i.e. whichever device was limiting the current journal entry size. */ - extent_for_each_ptr_backwards(e, ptr) - if (!(ca = PTR_DEV(c, ptr)) || - ca->mi.state != BCH_MEMBER_STATE_ACTIVE || + extent_for_each_ptr_backwards(e, ptr) { + ca = c->devs[ptr->dev]; + + if (ca->mi.state != BCH_MEMBER_STATE_RW || ca->journal.sectors_free <= sectors) __bch_extent_drop_ptr(e, ptr); else ca->journal.sectors_free -= sectors; + } replicas = bch_extent_nr_ptrs(e.c); @@ -2051,8 +2059,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) * Pick devices for next journal write: * XXX: sort devices by free journal space? */ - for (i = 0; i < j->devs.nr; i++) { - ca = j->devs.d[i].dev; + group_for_each_dev(ca, &j->devs, i) { ja = &ca->journal; if (replicas >= replicas_want) @@ -2082,7 +2089,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } spin_unlock(&j->devs.lock); - rcu_read_unlock(); j->prev_buf_sectors = 0; spin_unlock(&j->lock); @@ -2148,7 +2154,7 @@ static void journal_write_endio(struct bio *bio) bch_journal_halt(j); closure_put(&j->io); - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); } static void journal_write_done(struct closure *cl) @@ -2253,13 +2259,8 @@ static void journal_write(struct closure *cl) goto no_io; extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) { - rcu_read_lock(); - ca = PTR_DEV(c, ptr); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - if (!ca) { + ca = c->devs[ptr->dev]; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); continue; @@ -2284,11 +2285,10 @@ static void journal_write(struct closure *cl) ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); } - for_each_member_device(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - journal_flushes_device(ca) && + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { - percpu_ref_get(&ca->ref); + percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; bio_reset(bio); @@ -2631,7 +2631,8 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - group_for_each_dev_rcu(ca, &j->devs, iter) { + spin_lock(&j->devs.lock); + group_for_each_dev(ca, &j->devs, iter) { struct journal_device *ja = &ca->journal; ret += scnprintf(buf + ret, PAGE_SIZE - ret, @@ -2643,6 +2644,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } + spin_unlock(&j->devs.lock); spin_unlock(&j->lock); rcu_read_unlock(); @@ -2748,19 +2750,24 @@ void bch_fs_journal_stop(struct journal *j) void bch_dev_journal_exit(struct bch_dev *ca) { + kfree(ca->journal.bio); kfree(ca->journal.buckets); kfree(ca->journal.bucket_seq); + + ca->journal.bio = NULL; + ca->journal.buckets = NULL; + ca->journal.bucket_seq = NULL; } -int bch_dev_journal_init(struct bch_dev *ca) +int bch_dev_journal_init(struct 
bch_dev *ca, struct bch_sb *sb) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(ca->disk_sb.sb); + bch_sb_get_journal(sb); unsigned i, journal_entry_pages; journal_entry_pages = - DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb), + DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), PAGE_SECTORS); ja->nr = bch_nr_journal_buckets(journal_buckets); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 96f0b764837b..c83f81046f47 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -366,7 +366,7 @@ int bch_journal_move(struct bch_dev *); void bch_fs_journal_stop(struct journal *); void bch_dev_journal_exit(struct bch_dev *); -int bch_dev_journal_init(struct bch_dev *); +int bch_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch_fs_journal_exit(struct journal *); int bch_fs_journal_init(struct journal *, unsigned); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ce6defe5bda1..5bd93be2fddf 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -64,7 +64,7 @@ int bch_move_data_off_device(struct bch_dev *ca) u64 seen_key_count; int ret = 0; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE); + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); if (!ca->mi.has_data) return 0; @@ -163,7 +163,7 @@ static int bch_move_btree_off(struct bch_dev *ca, enum btree_id id) struct btree *b; int ret; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE); + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); closure_init_stack(&cl); @@ -259,7 +259,7 @@ int bch_move_metadata_off_device(struct bch_dev *ca) unsigned i; int ret; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE); + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); if (!ca->mi.has_metadata) return 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 25b203a142ce..a9a9d3197b6d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -17,12 +17,7 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, struct bch_extent_ptr ptr) { struct bch_extent_ptr *ptr2; - struct bch_member_rcu *mi; - unsigned bucket_bits; - - mi = fs_member_info_get(c); - bucket_bits = ilog2(mi->m[ptr.dev].bucket_size); - fs_member_info_put(); + unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits; extent_for_each_ptr(e, ptr2) if (ptr2->dev == ptr.dev && diff --git a/fs/bcachefs/notify.c b/fs/bcachefs/notify.c index 675dc26cd9ef..1d5f626fcf5d 100644 --- a/fs/bcachefs/notify.c +++ b/fs/bcachefs/notify.c @@ -31,11 +31,10 @@ static void notify_get(struct bch_fs *c) static void notify_get_cache(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - char buf[BDEVNAME_SIZE]; notify_get(c); notify_var(c, "UUID=%pU", ca->uuid.b); - notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf)); + notify_var(c, "BLOCKDEV=%s", ca->name); } static void notify_put(struct bch_fs *c) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 23302d44a8bb..41780d594af1 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -50,7 +50,7 @@ const char * const bch_cache_modes[] = { }; const char * const bch_dev_state[] = { - "active", + "readwrite", "readonly", "failed", "spare", diff --git a/fs/bcachefs/request.c b/fs/bcachefs/request.c index 2b9e687e742b..0646346e4667 100644 --- a/fs/bcachefs/request.c +++ b/fs/bcachefs/request.c @@ -712,14 +712,7 @@ static int cached_dev_congested(void *data, int bits) return 1; if (cached_dev_get(dc)) { - unsigned i; - struct bch_dev *ca; - - for_each_member_device(ca, d->c, i) { - q = bdev_get_queue(ca->disk_sb.bdev); - ret |= 
bdi_congested(&q->backing_dev_info, bits); - } - + ret |= bch_congested(d->c, bits); cached_dev_put(dc); } @@ -802,17 +795,8 @@ static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode, static int blockdev_volume_congested(void *data, int bits) { struct bcache_device *d = data; - struct request_queue *q; - struct bch_dev *ca; - unsigned i; - int ret = 0; - for_each_member_device(ca, d->c, i) { - q = bdev_get_queue(ca->disk_sb.bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); - } - - return ret; + return bch_congested(d->c, bits); } void bch_blockdev_volume_request_init(struct bcache_device *d) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index fd635e64f096..67c03e1932b1 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -174,7 +174,9 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *c, if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d)) return NULL; - for_each_member_device(ca, c, i) { + /* XXX: we're not checking that offline devices have enough space */ + + for_each_online_member(ca, c, i) { struct bcache_superblock *sb = &ca->disk_sb; if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { @@ -306,7 +308,7 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *sb_mi; - struct bch_member_cpu mi; + struct bch_member_cpu mi; const char *err; u16 block_size; @@ -408,7 +410,7 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) return err; sb_mi = bch_sb_get_members(sb); - mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); + mi = bch_mi_to_cpu(sb_mi->members + sb->dev_idx); if (mi.nbuckets > LONG_MAX) return "Too many buckets"; @@ -434,104 +436,33 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) /* device open: */ -static bool bch_is_open_cache(struct block_device *bdev) -{ - struct bch_fs *c; - struct bch_dev *ca; - unsigned i; - - rcu_read_lock(); - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i) - if (ca->disk_sb.bdev == bdev) { - rcu_read_unlock(); - return true; - } - rcu_read_unlock(); - return false; -} - -static bool bch_is_open(struct block_device *bdev) -{ - bool ret; - - mutex_lock(&bch_register_lock); - ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); - mutex_unlock(&bch_register_lock); - - return ret; -} - static const char *bch_blkdev_open(const char *path, fmode_t mode, void *holder, struct block_device **ret) { struct block_device *bdev; - const char *err; *ret = NULL; bdev = blkdev_get_by_path(path, mode, holder); - - if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return "device busy"; - - err = bch_is_open(bdev) - ? 
"device already registered" - : "device busy"; - - bdput(bdev); - return err; - } + if (bdev == ERR_PTR(-EBUSY)) + return "device busy"; if (IS_ERR(bdev)) return "failed to open device"; - bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; + if (mode & FMODE_WRITE) + bdev_get_queue(bdev)->backing_dev_info.capabilities + |= BDI_CAP_STABLE_WRITES; *ret = bdev; return NULL; } -/* Update cached mi: */ -int bch_fs_mi_update(struct bch_fs *c, struct bch_member *mi, - unsigned nr_devices) -{ - struct bch_member_rcu *new, *old; - struct bch_dev *ca; - unsigned i; - - lockdep_assert_held(&c->sb_lock); - - new = kzalloc(sizeof(struct bch_member_rcu) + - sizeof(struct bch_member_cpu) * nr_devices, - GFP_KERNEL); - if (!new) - return -ENOMEM; - - new->nr_devices = nr_devices; - - for (i = 0; i < nr_devices; i++) - new->m[i] = cache_mi_to_cpu_mi(&mi[i]); - - rcu_read_lock(); - for_each_member_device(ca, c, i) - ca->mi = new->m[i]; - rcu_read_unlock(); - - old = rcu_dereference_protected(c->members, - lockdep_is_held(&c->sb_lock)); - - rcu_assign_pointer(c->members, new); - if (old) - kfree_rcu(old, rcu); - - return 0; -} - static void bch_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb; + struct bch_sb_field_members *mi = bch_sb_get_members(src); + struct bch_dev *ca; + unsigned i; lockdep_assert_held(&c->sb_lock); @@ -548,6 +479,9 @@ static void bch_sb_update(struct bch_fs *c) c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); c->sb.time_precision = le32_to_cpu(src->time_precision); + + for_each_member_device(ca, c, i) + ca->mi = bch_mi_to_cpu(mi->members + i); } /* doesn't copy member info */ @@ -586,8 +520,6 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src) int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src) { - struct bch_sb_field_members *members = - bch_sb_get_members(src); struct bch_sb_field_journal *journal_buckets = bch_sb_get_journal(src); unsigned journal_u64s = journal_buckets @@ -599,9 +531,6 @@ int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src) if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s)) return -ENOMEM; - if (bch_fs_mi_update(c, members->members, src->nr_devices)) - return -ENOMEM; - __copy_super(c->disk_sb, src); bch_sb_update(c); @@ -784,7 +713,7 @@ static void write_super_endio(struct bio *bio) bch_account_io_completion(ca); closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); } static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) @@ -795,6 +724,9 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) if (idx >= sb->layout.nr_superblocks) return false; + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + sb->offset = sb->layout.sb_offset[idx]; SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); @@ -812,16 +744,12 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); bch_bio_map(bio, sb); - percpu_ref_get(&ca->ref); closure_bio_submit_punt(bio, &c->sb_write, c); - return true; } void bch_write_super(struct bch_fs *c) { - struct bch_sb_field_members *members = - bch_sb_get_members(c->disk_sb); struct closure *cl = &c->sb_write; struct bch_dev *ca; unsigned i, super_idx = 0; @@ -833,7 +761,7 @@ void bch_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb->seq, 1); - for_each_member_device(ca, c, i) + for_each_online_member(ca, c, i) bch_sb_from_fs(c, ca); if 
(c->opts.nochanges) @@ -841,7 +769,7 @@ void bch_write_super(struct bch_fs *c) do { wrote = false; - for_each_member_device(ca, c, i) + for_each_online_member(ca, c, i) if (write_one_super(c, ca, super_idx)) wrote = true; @@ -850,7 +778,6 @@ void bch_write_super(struct bch_fs *c) } while (wrote); out: /* Make new options visible after they're persistent: */ - bch_fs_mi_update(c, members->members, c->sb.nr_devices); bch_sb_update(c); } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index ed0338cf22a6..1a9bd3092e4c 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -83,7 +83,7 @@ static inline __u64 bset_magic(struct bch_fs *c) return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC); } -static inline struct bch_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi) +static inline struct bch_member_cpu bch_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { .nbuckets = le64_to_cpu(mi->nbuckets), @@ -99,8 +99,6 @@ static inline struct bch_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi) }; } -int bch_fs_mi_update(struct bch_fs *, struct bch_member *, unsigned); - int bch_sb_to_fs(struct bch_fs *, struct bch_sb *); int bch_sb_from_fs(struct bch_fs *, struct bch_dev *); @@ -118,27 +116,23 @@ void bch_write_super(struct bch_fs *); void bch_check_mark_super_slowpath(struct bch_fs *, const struct bkey_i *, bool); -#define fs_member_info_get(_c) \ - (rcu_read_lock(), rcu_dereference((_c)->members)) - -#define fs_member_info_put() rcu_read_unlock() - static inline bool bch_check_super_marked(struct bch_fs *c, const struct bkey_i *k, bool meta) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); const struct bch_extent_ptr *ptr; - struct bch_member_cpu *mi = fs_member_info_get(c)->m; unsigned nr_replicas = 0; bool ret = true; extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + if (ptr->cached) continue; if (!(meta - ? mi[ptr->dev].has_metadata - : mi[ptr->dev].has_data)) { + ? ca->mi.has_metadata + : ca->mi.has_data)) { ret = false; break; } @@ -150,8 +144,6 @@ static inline bool bch_check_super_marked(struct bch_fs *c, (meta ? 
c->sb.meta_replicas_have : c->sb.data_replicas_have)) ret = false; - fs_member_info_put(); - return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 19c139418790..200b2b31eba0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -62,28 +62,77 @@ static const uuid_le invalid_uuid = { }; static struct kset *bcache_kset; -struct mutex bch_register_lock; -LIST_HEAD(bch_fs_list); +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; static void bch_dev_free(struct bch_dev *); -static int bch_dev_online(struct bch_dev *); +static int bch_dev_alloc(struct bch_fs *, unsigned); +static int bch_dev_sysfs_online(struct bch_dev *); +static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *); -static int bch_congested_fn(void *data, int bdi_bits) +struct bch_fs *bch_bdev_to_fs(struct block_device *bdev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i) + if (ca->disk_sb.bdev == bdev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +struct bch_fs *bch_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +int bch_congested(struct bch_fs *c, int bdi_bits) { struct backing_dev_info *bdi; - struct bch_fs *c = data; struct bch_dev *ca; unsigned i; int ret = 0; - rcu_read_lock(); if (bdi_bits & (1 << WB_sync_congested)) { /* Reads - check all devices: */ - for_each_member_device_rcu(ca, c, i) { + for_each_readable_member(ca, c, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -96,7 +145,8 @@ static int bch_congested_fn(void *data, int bdi_bits) struct bch_tier *tier = READ_ONCE(c->fastest_tier); struct dev_group *grp = tier ? 
&tier->devs : &c->all_devs; - group_for_each_dev_rcu(ca, grp, i) { + rcu_read_lock(); + group_for_each_dev(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -104,12 +154,19 @@ static int bch_congested_fn(void *data, int bdi_bits) break; } } + rcu_read_unlock(); } - rcu_read_unlock(); return ret; } +static int bch_congested_fn(void *data, int bdi_bits) +{ + struct bch_fs *c = data; + + return bch_congested(c, bdi_bits); +} + /* Filesystem RO/RW: */ /* @@ -256,10 +313,9 @@ const char *bch_fs_read_write(struct bch_fs *c) goto out; err = "error starting allocator thread"; - for_each_member_device(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -268,10 +324,9 @@ const char *bch_fs_read_write(struct bch_fs *c) goto err; err = "error starting moving GC thread"; - for_each_member_device(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_moving_gc_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_moving_gc_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -324,7 +379,6 @@ static void bch_fs_free(struct bch_fs *c) if (c->wq) destroy_workqueue(c->wq); - kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */ free_pages((unsigned long) c->disk_sb, c->disk_sb_order); kfree(c); module_put(THIS_MODULE); @@ -353,17 +407,19 @@ static void bch_fs_offline(struct bch_fs *c) struct bch_dev *ca; unsigned i; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); list_del(&c->list); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcache"); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); - for_each_member_device(ca, c, i) - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - bch_fs_debug_exit(c); bch_fs_chardev_exit(c); @@ -453,7 +509,6 @@ void bch_fs_stop(struct bch_fs *c) closure_sync(&c->cl); bch_fs_exit(c); - kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ @@ -468,8 +523,9 @@ void bch_fs_detach(struct bch_fs *c) static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { + struct bch_sb_field_members *mi; struct bch_fs *c; - unsigned iter_size, journal_entry_bytes; + unsigned i, iter_size, journal_entry_bytes; c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL); if (!c) @@ -607,6 +663,12 @@ static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->bdi.congested_fn = bch_congested_fn; c->bdi.congested_data = c; + mi = bch_sb_get_members(c->disk_sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) && + bch_dev_alloc(c, i)) + goto err; + /* * Now that all allocations have succeeded, init various refcounty * things that let us shutdown: @@ -632,31 +694,19 @@ err: return NULL; } -static struct bch_fs *bch_fs_lookup(uuid_le uuid) -{ - struct bch_fs *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - static const char *__bch_fs_online(struct bch_fs *c) { struct bch_dev *ca; + const char *err = NULL; unsigned i; int ret; - lockdep_assert_held(&bch_register_lock); + 
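
One pattern worth calling out in the new helpers above: bch_bdev_to_fs() and bch_uuid_to_fs() take their reference with closure_get() while bch_fs_list_lock is still held, so the filesystem they return cannot be torn down between the list walk and the caller's use of it. A minimal userspace sketch of that take-a-reference-under-the-lock shape, with every name invented for illustration:

#include <pthread.h>
#include <string.h>

struct fs {
	struct fs *next;
	unsigned char uuid[16];
	int refcount;	/* protected by list_lock in this sketch */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fs *fs_list;

/* Analogue of bch_uuid_to_fs(): the returned object stays valid because
 * the reference is taken before the list lock is dropped. */
static struct fs *uuid_to_fs(const unsigned char uuid[16])
{
	struct fs *c;

	pthread_mutex_lock(&list_lock);
	for (c = fs_list; c; c = c->next)
		if (!memcmp(c->uuid, uuid, sizeof(c->uuid))) {
			c->refcount++;	/* stands in for closure_get(&c->cl) */
			break;
		}
	pthread_mutex_unlock(&list_lock);

	return c;	/* NULL if absent; caller drops the ref when done */
}

The sysfs attach path later in this patch uses the same discipline: bch_uuid_to_fs() to pin the filesystem, then closure_put(&c->cl) once the attach has completed.
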
lockdep_assert_held(&bch_fs_list_lock); if (!list_empty(&c->list)) return NULL; - if (bch_fs_lookup(c->sb.uuid)) + if (__bch_uuid_to_fs(c->sb.uuid)) return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); @@ -672,35 +722,33 @@ static const char *__bch_fs_online(struct bch_fs *c) bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) return "error creating sysfs objects"; - for_each_member_device(ca, c, i) - if (bch_dev_online(ca)) { - percpu_ref_put(&ca->ref); - return "error creating sysfs objects"; - } - mutex_lock(&c->state_lock); - if (bch_blockdev_volumes_start(c)) { - mutex_unlock(&c->state_lock); - return "can't bring up blockdev volumes"; - } + err = "error creating sysfs objects"; + __for_each_member_device(ca, c, i) + if (bch_dev_sysfs_online(ca)) + goto err; - bch_attach_backing_devs(c); + err = "can't bring up blockdev volumes"; + if (bch_blockdev_volumes_start(c)) + goto err; - mutex_unlock(&c->state_lock); + bch_attach_backing_devs(c); list_add(&c->list, &bch_fs_list); - - return 0; + err = NULL; +err: + mutex_unlock(&c->state_lock); + return err; } static const char *bch_fs_online(struct bch_fs *c) { const char *err; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); err = __bch_fs_online(c); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); return err; } @@ -719,7 +767,7 @@ static const char *__bch_fs_start(struct bch_fs *c) BUG_ON(c->state != BCH_FS_STARTING); mutex_lock(&c->sb_lock); - for_each_member_device(ca, c, i) + for_each_online_member(ca, c, i) bch_sb_from_fs(c, ca); mutex_unlock(&c->sb_lock); @@ -728,27 +776,20 @@ static const char *__bch_fs_start(struct bch_fs *c) if (ret) goto err; - pr_debug("btree_journal_read() done"); - j = &list_entry(journal.prev, struct journal_replay, list)->j; + c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); + c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); + err = "error reading priorities"; - for_each_member_device(ca, c, i) { + for_each_readable_member(ca, c, i) { ret = bch_prio_read(ca); if (ret) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); goto err; } } - c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); - c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - for_each_member_device(ca, c, i) { - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - } - for (id = 0; id < BTREE_ID_NR; id++) { unsigned level; struct bkey_i *k; @@ -786,10 +827,9 @@ static const char *__bch_fs_start(struct bch_fs *c) bch_journal_start(c); err = "error starting allocator thread"; - for_each_member_device(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -824,9 +864,9 @@ static const char *__bch_fs_start(struct bch_fs *c) bch_initial_gc(c, NULL); err = "unable to allocate journal buckets"; - for_each_member_device(ca, c, i) + for_each_rw_member(ca, c, i) if (bch_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); goto err; } @@ -838,10 +878,9 @@ static const char *__bch_fs_start(struct bch_fs *c) bch_journal_set_replay_done(&c->journal); err = "error starting allocator thread"; - for_each_member_device(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ 
-888,10 +927,8 @@ recovery_done: mi = bch_sb_get_members(c->disk_sb); now = ktime_get_seconds(); - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i) + for_each_member_device(ca, c, i) mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - rcu_read_unlock(); SET_BCH_SB_INITIALIZED(c->disk_sb, true); SET_BCH_SB_CLEAN(c->disk_sb, false); @@ -991,30 +1028,27 @@ void bch_dev_release(struct kobject *kobj) static void bch_dev_free(struct bch_dev *ca) { - struct bch_fs *c = ca->fs; unsigned i; cancel_work_sync(&ca->io_error_work); - if (c && c->kobj.state_in_sysfs) { - char buf[12]; - - sprintf(buf, "cache%u", ca->dev_idx); - sysfs_remove_link(&c->kobj, buf); - } + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcache"); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); bch_dev_journal_exit(ca); + free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->usage_percpu); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); kfree(ca->bio_prio); - kfree(ca->journal.bio); vfree(ca->buckets); vfree(ca->oldest_gens); free_heap(&ca->heap); @@ -1023,46 +1057,47 @@ static void bch_dev_free(struct bch_dev *ca) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); - - if (c) - kobject_put(&c->kobj); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_io_ref_release(struct percpu_ref *ref) { - struct bch_dev *ca = container_of(work, struct bch_dev, free_work); + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); - bch_dev_free(ca); + complete(&ca->offline_complete); } -static void bch_dev_percpu_ref_release(struct percpu_ref *ref) +static void bch_dev_offline(struct bch_dev *ca) { - struct bch_dev *ca = container_of(ref, struct bch_dev, ref); + struct bch_fs *c = ca->fs; + + lockdep_assert_held(&c->state_lock); + + __bch_dev_read_only(ca->fs, ca); + + reinit_completion(&ca->offline_complete); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->offline_complete); + + if (ca->kobj.state_in_sysfs) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - schedule_work(&ca->free_work); + sysfs_remove_link(block, "bcache"); + sysfs_remove_link(&ca->kobj, "block"); + } + + bch_free_super(&ca->disk_sb); + bch_dev_journal_exit(ca); } -static void bch_dev_free_rcu(struct rcu_head *rcu) +static void bch_dev_ref_release(struct percpu_ref *ref) { - struct bch_dev *ca = container_of(rcu, struct bch_dev, free_rcu); - - /* - * This decrements the ref count to ca, and once the ref count - * is 0 (outstanding bios to the ca also incremented it and - * decrement it on completion/error), bch_dev_percpu_ref_release - * is called, and that eventually results in bch_dev_free_work - * being called, which in turn results in bch_dev_release being - * called. - * - * In particular, these functions won't be called until there are no - * bios outstanding (the per-cpu ref counts are all 0), so it - * is safe to remove the actual sysfs device at that point, - * and that can indicate success to the user. 
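
The teardown that replaces the old RCU-deferred free (described in the removed comment here) is a conventional kill-and-wait sequence: percpu_ref_kill() makes every subsequent percpu_ref_tryget() fail, and when the final holder drops its reference the release callback calls complete(), which lets wait_for_completion() return. A rough userspace analogue of that sequence; all names are invented, and the real percpu_ref handles races this sketch glosses over:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Invented stand-in for a percpu_ref plus completion. count starts at 1,
 * the "base" reference that kill drops. */
struct ioref {
	atomic_long count;
	atomic_bool dying;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void ioref_put(struct ioref *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1) {
		/* last holder gone: the "release callback" completes */
		pthread_mutex_lock(&r->lock);
		pthread_cond_signal(&r->done);
		pthread_mutex_unlock(&r->lock);
	}
}

static bool ioref_tryget(struct ioref *r)
{
	if (atomic_load(&r->dying))
		return false;	/* like percpu_ref_tryget() after kill */
	atomic_fetch_add(&r->count, 1);
	return true;
}

/* Analogue of bch_dev_offline(): stop new users, drop the base ref,
 * then block until in-flight users drain (wait_for_completion()). */
static void ioref_kill_and_wait(struct ioref *r)
{
	atomic_store(&r->dying, true);	/* percpu_ref_kill() */
	ioref_put(r);

	pthread_mutex_lock(&r->lock);
	while (atomic_load(&r->count) > 0)
		pthread_cond_wait(&r->done, &r->lock);
	pthread_mutex_unlock(&r->lock);
}

Note that io_ref is created with PERCPU_REF_INIT_DEAD a few hunks below and only switched live with percpu_ref_reinit() at the end of bch_dev_online(), so allocated-but-offline devices refuse I/O references from the start.
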
- */ + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - percpu_ref_kill(&ca->ref); + complete(&ca->stop_complete); } static void bch_dev_stop(struct bch_dev *ca) @@ -1074,26 +1109,44 @@ static void bch_dev_stop(struct bch_dev *ca) BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca); rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - call_rcu(&ca->free_rcu, bch_dev_free_rcu); + synchronize_rcu(); + + reinit_completion(&ca->stop_complete); + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->stop_complete); } -static int bch_dev_online(struct bch_dev *ca) +static int bch_dev_sysfs_online(struct bch_dev *ca) { - char buf[12]; + struct bch_fs *c = ca->fs; + int ret; - sprintf(buf, "cache%u", ca->dev_idx); + if (!c->kobj.state_in_sysfs) + return 0; + + if (!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &ca->fs->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } - if (kobject_add(&ca->kobj, - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcache") || - sysfs_create_link(&ca->kobj, &ca->fs->kobj, "set") || - sysfs_create_link(&ca->fs->kobj, &ca->kobj, buf)) - return -1; + if (ca->disk_sb.bdev) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + + ret = sysfs_create_link(block, &ca->kobj, "bcache"); + if (ret) + return ret; + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } return 0; } -static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb) +static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member *member; size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; @@ -1102,47 +1155,37 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb) struct bch_dev *ca; if (bch_fs_init_fault("dev_alloc")) - return NULL; + return -ENOMEM; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - return NULL; - - if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release, - 0, GFP_KERNEL)) { - kfree(ca); - return NULL; - } + return -ENOMEM; kobject_init(&ca->kobj, &bch_dev_ktype); + init_completion(&ca->stop_complete); + init_completion(&ca->offline_complete); spin_lock_init(&ca->self.lock); ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); - ca->dev_idx = sb->sb->dev_idx; + ca->dev_idx = dev_idx; - INIT_WORK(&ca->free_work, bch_dev_free_work); spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); bch_dev_moving_gc_init(ca); - ca->disk_sb = *sb; - if (sb->mode & FMODE_EXCL) - ca->disk_sb.bdev->bd_holder = ca; - memset(sb, 0, sizeof(*sb)); - INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); if (bch_fs_init_fault("dev_alloc")) goto err; - member = bch_sb_get_members(ca->disk_sb.sb)->members + - ca->disk_sb.sb->dev_idx; + member = bch_sb_get_members(c->disk_sb)->members + dev_idx; - ca->mi = cache_mi_to_cpu_mi(member); + ca->mi = bch_mi_to_cpu(member); ca->uuid = member->uuid; ca->bucket_bits = ilog2(ca->mi.bucket_size); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); /* XXX: tune these */ movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); @@ -1155,7 +1198,11 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb) free_inc_reserve = movinggc_reserve / 2; heap_size = movinggc_reserve * 8; - if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + if (percpu_ref_init(&ca->ref, bch_dev_ref_release, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + 
!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], movinggc_reserve, GFP_KERNEL) || @@ -1166,15 +1213,14 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb) ca->mi.nbuckets)) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->mi.nbuckets)) || - !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * + !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_dev_journal_init(ca)) + !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1182,75 +1228,76 @@ static struct bch_dev *__bch_dev_alloc(struct bcache_superblock *sb) total_reserve = ca->free_inc.size; for (i = 0; i < RESERVE_NR; i++) total_reserve += ca->free[i].size; - pr_debug("%zu buckets reserved", total_reserve); ca->copygc_write_point.group = &ca->self; ca->tiering_write_point.group = &ca->self; - return ca; + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch_dev_sysfs_online(ca)) + pr_warn("error creating sysfs objects"); + + return 0; err: bch_dev_free(ca); - return NULL; + return -ENOMEM; } -static const char *__bch_dev_add(struct bch_fs *c, struct bch_dev *ca) +static int bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb) { - if (c->devs[ca->dev_idx]) - return "already have device online in this slot"; + struct bch_dev *ca; + int ret; - if (c->sb.nr_devices == 1) - bdevname(ca->disk_sb.bdev, c->name); + lockdep_assert_held(&c->sb_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb->seq)) + bch_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = c->devs[sb->sb->dev_idx]; + if (ca->disk_sb.bdev) { + bch_err(c, "already have device online in slot %u", + sb->sb->dev_idx); + return -EINVAL; + } + + ret = bch_dev_journal_init(ca, sb->sb); + if (ret) + return ret; /* * Increase journal write timeout if flushes to this device are * expensive: */ - if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) && + if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) && journal_flushes_device(ca)) c->journal.write_delay_ms = max(c->journal.write_delay_ms, 1000U); - kobject_get(&c->kobj); - ca->fs = c; + /* Commit: */ + ca->disk_sb = *sb; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); - kobject_get(&ca->kobj); - rcu_assign_pointer(c->devs[ca->dev_idx], ca); + if (c->sb.nr_devices == 1) + bdevname(ca->disk_sb.bdev, c->name); + bdevname(ca->disk_sb.bdev, ca->name); - if (c->kobj.state_in_sysfs && - bch_dev_online(ca)) + if (bch_dev_sysfs_online(ca)) pr_warn("error creating sysfs objects"); - return NULL; -} + lg_local_lock(&c->usage_lock); + if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) + bch_mark_dev_metadata(ca->fs, ca); + lg_local_unlock(&c->usage_lock); -static const char *bch_dev_alloc(struct bcache_superblock *sb, - struct bch_fs *c, - struct bch_dev **ret) -{ - struct bch_dev *ca; - const char *err; - - ca = __bch_dev_alloc(sb); - if (!ca) - return "cannot allocate memory"; - - err = 
__bch_dev_add(c, ca); - if (err) { - bch_dev_free(ca); - return err; - } - - mutex_lock(&c->sb_lock); - if (le64_to_cpu(ca->disk_sb.sb->seq) > - le64_to_cpu(c->disk_sb->seq)) - bch_sb_to_fs(c, ca->disk_sb.sb); - mutex_unlock(&c->sb_lock); - - if (ret) - *ret = ca; - else - kobject_put(&ca->kobj); - return NULL; + percpu_ref_reinit(&ca->io_ref); + return 0; } /* Device management: */ @@ -1304,7 +1351,7 @@ bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, { lockdep_assert_held(&c->state_lock); - if (new_state == BCH_MEMBER_STATE_ACTIVE) + if (new_state == BCH_MEMBER_STATE_RW) return true; if (ca->mi.has_data && @@ -1346,8 +1393,7 @@ static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) - return NULL; + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); trace_bcache_cache_read_write(ca); @@ -1370,7 +1416,6 @@ int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_sb_field_members *mi; - char buf[BDEVNAME_SIZE]; if (ca->mi.state == new_state) return 0; @@ -1378,16 +1423,14 @@ int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (!bch_dev_state_allowed(c, ca, new_state, flags)) return -EINVAL; - if (new_state == BCH_MEMBER_STATE_ACTIVE) { + if (new_state == BCH_MEMBER_STATE_RW) { if (__bch_dev_read_write(c, ca)) return -ENOMEM; } else { __bch_dev_read_only(c, ca); } - bch_notice(c, "%s %s", - bdevname(ca->disk_sb.bdev, buf), - bch_dev_state[new_state]); + bch_notice(ca, "%s", bch_dev_state[new_state]); mutex_lock(&c->sb_lock); mi = bch_sb_get_members(c->disk_sb); @@ -1448,20 +1491,17 @@ int bch_dev_migrate_from(struct bch_fs *c, struct bch_dev *ca) static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_sb_field_members *mi; - char name[BDEVNAME_SIZE]; unsigned dev_idx = ca->dev_idx; int ret; - bdevname(ca->disk_sb.bdev, name); - - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) { - bch_err(ca->fs, "Cannot remove RW device"); + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + bch_err(ca, "Cannot remove RW device"); bch_notify_dev_remove_failed(ca); return -EINVAL; } if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { - bch_err(ca->fs, "Cannot remove %s without losing data", name); + bch_err(ca, "Cannot remove without losing data"); bch_notify_dev_remove_failed(ca); return -EINVAL; } @@ -1473,7 +1513,12 @@ static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ ret = bch_flag_data_bad(ca); if (ret) { - bch_err(c, "Remove of %s failed", name); + bch_err(ca, "Remove failed"); + return ret; + } + + if (ca->mi.has_data || ca->mi.has_metadata) { + bch_err(ca, "Can't remove, still has data"); return ret; } @@ -1489,13 +1534,9 @@ static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch_journal_meta(&c->journal); + bch_dev_offline(ca); bch_dev_stop(ca); - - /* - * RCU barrier between dropping between c->dev and dropping from - * member info: - */ - synchronize_rcu(); + bch_dev_free(ca); /* * Free this device's slot in the bch_member array - all pointers to @@ -1517,6 +1558,7 @@ int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) int ret; mutex_lock(&c->state_lock); + percpu_ref_put(&ca->ref); ret = __bch_dev_remove(c, ca, flags); mutex_unlock(&c->state_lock); @@ -1556,18 +1598,9 @@ int bch_dev_add(struct bch_fs *c, const char *path) saved_mi = dev_mi->members[sb.sb->dev_idx]; saved_mi.last_mount = 
cpu_to_le64(ktime_get_seconds()); - /* - * XXX: ditch the GC stuff, just don't remove a device until nothing is - * using its dev_idx anymore - */ - down_read(&c->gc_lock); - if (dynamic_fault("bcache:add:no_slot")) goto no_slot; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - goto no_slot; - mi = bch_sb_get_members(c->disk_sb); for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) if (dev_idx >= c->sb.nr_devices || @@ -1575,15 +1608,11 @@ int bch_dev_add(struct bch_fs *c, const char *path) sizeof(uuid_le))) goto have_slot; no_slot: - up_read(&c->gc_lock); - err = "no slots available in superblock"; ret = -ENOSPC; goto err_unlock; have_slot: - up_read(&c->gc_lock); - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); @@ -1604,53 +1633,44 @@ have_slot: sb.sb->dev_idx = dev_idx; sb.sb->nr_devices = nr_devices; - if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) { - err = "cannot allocate memory"; - ret = -ENOMEM; - goto err_unlock; - } - /* commit new member info */ memcpy(mi, dev_mi, u64s * sizeof(u64)); c->disk_sb->nr_devices = nr_devices; c->sb.nr_devices = nr_devices; - ca = __bch_dev_alloc(&sb); - if (!ca) { + if (bch_dev_alloc(c, dev_idx)) { err = "cannot allocate memory"; ret = -ENOMEM; goto err_unlock; } - bch_dev_mark_superblocks(ca); - - err = "journal alloc failed"; - if (bch_dev_journal_alloc(ca)) + if (bch_dev_online(c, &sb)) { + err = "bch_dev_online() error"; + ret = -ENOMEM; goto err_unlock; - - err = __bch_dev_add(c, ca); - BUG_ON(err); + } bch_write_super(c); mutex_unlock(&c->sb_lock); - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) { + ca = c->devs[dev_idx]; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = "journal alloc failed"; + if (bch_dev_journal_alloc(ca)) + goto err; + err = __bch_dev_read_write(c, ca); if (err) goto err; } bch_notify_dev_added(ca); - - kobject_put(&ca->kobj); mutex_unlock(&c->state_lock); return 0; err_unlock: mutex_unlock(&c->sb_lock); err: mutex_unlock(&c->state_lock); - if (ca) - bch_dev_stop(ca); bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); @@ -1708,11 +1728,14 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, if (!c) goto err; - for (i = 0; i < nr_devices; i++) { - err = bch_dev_alloc(&sb[i], c, NULL); - if (err) + err = "bch_dev_online() error"; + mutex_lock(&c->sb_lock); + for (i = 0; i < nr_devices; i++) + if (bch_dev_online(c, &sb[i])) { + mutex_unlock(&c->sb_lock); goto err; - } + } + mutex_unlock(&c->sb_lock); err = "insufficient devices"; if (!bch_fs_may_start(c, 0)) @@ -1760,8 +1783,8 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) return err; - mutex_lock(&bch_register_lock); - c = bch_fs_lookup(sb->sb->uuid); + mutex_lock(&bch_fs_list_lock); + c = __bch_uuid_to_fs(sb->sb->uuid); if (c) { closure_get(&c->cl); @@ -1777,9 +1800,14 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, allocated_fs = true; } - err = bch_dev_alloc(sb, c, NULL); - if (err) + err = "bch_dev_online() error"; + + mutex_lock(&c->sb_lock); + if (bch_dev_online(c, sb)) { + mutex_unlock(&c->sb_lock); goto err; + } + mutex_unlock(&c->sb_lock); if (!c->opts.nostart && bch_fs_may_start(c, 0)) { err = __bch_fs_start(c); @@ -1792,11 +1820,11 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, goto err; closure_put(&c->cl); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); return NULL; err: - 
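
The slot search in bch_dev_add() above is split across two hunks, but its shape is clear from context: a slot is free if it lies past the current member array, or if its uuid has been zeroed, matching the zero-uuid test the allocation loop in bch_fs_alloc() applies earlier in the patch. A standalone illustration of that logic, with all names invented (MEMBERS_MAX stands in for BCH_SB_MEMBERS_MAX; memcmp() against a zero buffer stands in for bch_is_zero()):

#include <string.h>

#define MEMBERS_MAX 64	/* stand-in for BCH_SB_MEMBERS_MAX */

struct member {
	unsigned char uuid[16];
};

/* Invented helper: returns the first reusable slot index, or -1 if the
 * member array is full ("no slots available in superblock"). */
static int find_free_slot(const struct member *members, unsigned nr_devices)
{
	static const unsigned char zero_uuid[16];
	unsigned i;

	for (i = 0; i < MEMBERS_MAX; i++)
		if (i >= nr_devices ||
		    !memcmp(members[i].uuid, zero_uuid, sizeof(zero_uuid)))
			return (int) i;

	return -1;
}
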
mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); if (allocated_fs) bch_fs_stop(c); @@ -1817,9 +1845,9 @@ const char *bch_fs_open_incremental(const char *path) return err; if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); err = bch_backing_dev_register(&sb); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); } else { err = __bch_fs_open_incremental(&sb, opts); } @@ -1878,7 +1906,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) code == SYS_POWER_OFF) { struct bch_fs *c; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); if (!list_empty(&bch_fs_list)) pr_info("Setting all devices read only:"); @@ -1889,7 +1917,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry(c, &bch_fs_list, list) bch_fs_read_only(c); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); } return NOTIFY_DONE; @@ -1933,7 +1961,6 @@ static int __init bcache_init(void) NULL }; - mutex_init(&bch_register_lock); register_reboot_notifier(&reboot); bkey_pack_test(); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 13fb0e6b42e3..53026cb73696 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -20,42 +20,79 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return s & (ca->mi.bucket_size - 1); } -static inline struct bch_dev *bch_next_cache_rcu(struct bch_fs *c, - unsigned *iter) +static inline struct bch_dev *__bch_next_dev(struct bch_fs *c, unsigned *iter) { - struct bch_dev *ret = NULL; + struct bch_dev *ca = NULL; while (*iter < c->sb.nr_devices && - !(ret = rcu_dereference(c->devs[*iter]))) + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) (*iter)++; - return ret; + return ca; } +#define __for_each_member_device(ca, c, iter) \ + for ((iter) = 0; ((ca) = __bch_next_dev((c), &(iter))); (iter)++) + #define for_each_member_device_rcu(ca, c, iter) \ - for ((iter) = 0; ((ca) = bch_next_cache_rcu((c), &(iter))); (iter)++) + __for_each_member_device(ca, c, iter) -static inline struct bch_dev *bch_get_next_cache(struct bch_fs *c, - unsigned *iter) +static inline struct bch_dev *bch_get_next_dev(struct bch_fs *c, unsigned *iter) { - struct bch_dev *ret; + struct bch_dev *ca; rcu_read_lock(); - if ((ret = bch_next_cache_rcu(c, iter))) - percpu_ref_get(&ret->ref); + if ((ca = __bch_next_dev(c, iter))) + percpu_ref_get(&ca->ref); rcu_read_unlock(); - return ret; + return ca; } /* * If you break early, you must drop your ref on the current device */ -#define for_each_member_device(ca, c, iter) \ +#define for_each_member_device(ca, c, iter) \ for ((iter) = 0; \ - (ca = bch_get_next_cache(c, &(iter))); \ + (ca = bch_get_next_dev(c, &(iter))); \ percpu_ref_put(&ca->ref), (iter)++) +static inline struct bch_dev *bch_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch_next_dev(c, iter)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + 
__for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) + +struct bch_fs *bch_bdev_to_fs(struct block_device *); +struct bch_fs *bch_uuid_to_fs(uuid_le); +int bch_congested(struct bch_fs *, int); + void bch_dev_release(struct kobject *); bool bch_dev_state_allowed(struct bch_fs *, struct bch_dev *, @@ -84,8 +121,6 @@ const char *bch_fs_open(char * const *, unsigned, struct bch_opts, struct bch_fs **); const char *bch_fs_open_incremental(const char *path); -extern struct mutex bch_register_lock; -extern struct list_head bch_fs_list; extern struct workqueue_struct *bcache_io_wq; extern struct crypto_shash *bch_sha256; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 14675c2b721f..91897671b52d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -206,12 +206,10 @@ SHOW(bch_cached_dev) return 0; } -STORE(__cached_dev) +STORE(bch_cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; - struct bch_fs *c; struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) @@ -228,6 +226,13 @@ STORE(__cached_dev) d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); + if (attr == &sysfs_writeback_running) + bch_writeback_queue(dc); + + if (attr == &sysfs_writeback_percent) + schedule_delayed_work(&dc->writeback_pd_update, + dc->writeback_pd_update_seconds * HZ); + if (attr == &sysfs_clear_stats) bch_cache_accounting_clear(&dc->accounting); @@ -295,17 +300,25 @@ STORE(__cached_dev) } if (attr == &sysfs_attach) { - if (uuid_parse(buf, &dc->disk_sb.sb->user_uuid)) + struct bch_fs *c; + uuid_le uuid; + int ret; + + if (uuid_parse(buf, &uuid)) return -EINVAL; - list_for_each_entry(c, &bch_fs_list, list) { - v = bch_cached_dev_attach(dc, c); - if (!v) - return size; + c = bch_uuid_to_fs(uuid); + if (!c) { + pr_err("Can't attach %s: cache set not found", buf); + return -ENOENT; } - pr_err("Can't attach %s: cache set not found", buf); - size = v; + dc->disk_sb.sb->set_uuid = uuid; + + ret = bch_cached_dev_attach(dc, c); + closure_put(&c->cl); + if (ret) + return ret; } if (attr == &sysfs_detach && dc->disk.c) @@ -317,25 +330,6 @@ STORE(__cached_dev) return size; } -STORE(bch_cached_dev) -{ - struct cached_dev *dc = container_of(kobj, struct cached_dev, - disk.kobj); - - mutex_lock(&bch_register_lock); - size = __cached_dev_store(kobj, attr, buf, size); - - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) - schedule_delayed_work(&dc->writeback_pd_update, - dc->writeback_pd_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); - return size; -} - static struct attribute *bch_cached_dev_files[] = { &sysfs_attach, &sysfs_detach, @@ -380,7 +374,7 @@ SHOW(bch_blockdev_volume) return 0; } -STORE(__bch_blockdev_volume) +STORE(bch_blockdev_volume) { struct bcache_device *d = container_of(kobj, struct bcache_device, kobj); @@ -438,7 +432,6 @@ STORE(__bch_blockdev_volume) return size; } -STORE_LOCKED(bch_blockdev_volume) static struct attribute *bch_blockdev_volume_files[] = { &sysfs_unregister, @@ -1224,7 +1217,7 @@ SHOW(bch_dev) return 0; } -STORE(__bch_dev) +STORE(bch_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; @@ -1300,7 +1293,6 @@ STORE(__bch_dev) return size; } -STORE_LOCKED(bch_dev) static struct attribute *bch_dev_files[] = { &sysfs_uuid, diff --git 
a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 9d5845874931..02700246acaf 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -21,16 +21,6 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ const char *buf, size_t size) \ -#define STORE_LOCKED(fn) \ -STORE(fn) \ -{ \ - ssize_t ret; \ - mutex_lock(&bch_register_lock); \ - ret = __ ## fn ## _store(kobj, attr, buf, size); \ - mutex_unlock(&bch_register_lock); \ - return ret; \ -} - #define __sysfs_attribute(_name, _mode) \ static struct attribute sysfs_##_name = \ { .name = #_name, .mode = _mode } diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c index 1d6e06519483..b1ac13c99275 100644 --- a/fs/bcachefs/tier.c +++ b/fs/bcachefs/tier.c @@ -30,7 +30,6 @@ static bool tiering_pred(struct bch_fs *c, if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; - struct bch_member_rcu *mi; unsigned replicas = 0; /* Make sure we have room to add a new pointer: */ @@ -38,12 +37,9 @@ BKEY_EXTENT_VAL_U64s_MAX) return false; - mi = fs_member_info_get(c); extent_for_each_ptr(e, ptr) - if (ptr->dev < mi->nr_devices && - mi->m[ptr->dev].tier >= s->tier->idx) + if (c->devs[ptr->dev]->mi.tier >= s->tier->idx) replicas++; - fs_member_info_put(); return replicas < c->opts.data_replicas; } @@ -54,7 +50,7 @@ static bool tiering_pred(struct bch_fs *c, static void tier_put_device(struct tiering_state *s) { if (s->ca) - percpu_ref_put(&s->ca->ref); + percpu_ref_put(&s->ca->io_ref); s->ca = NULL; } @@ -74,7 +70,7 @@ static void tier_next_device(struct bch_fs *c, struct tiering_state *s) if (s->tier->devs.nr) { s->ca = s->tier->devs.d[s->dev_idx].dev; - percpu_ref_get(&s->ca->ref); + percpu_ref_get(&s->ca->io_ref); } spin_unlock(&s->tier->devs.lock); } @@ -183,19 +179,19 @@ static int bch_tiering_thread(void *arg) last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; - rcu_read_lock(); for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - group_for_each_dev_rcu(ca, &faster_tier->devs, i) { + spin_lock(&faster_tier->devs.lock); + group_for_each_dev(ca, &faster_tier->devs, i) { tier_capacity += (ca->mi.nbuckets - ca->mi.first_bucket) << ca->bucket_bits; available_sectors += dev_buckets_available(ca) << ca->bucket_bits; } + spin_unlock(&faster_tier->devs.lock); } - rcu_read_unlock(); if (available_sectors < (tier_capacity >> 1)) break;
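
A closing note on the iterator rework in super.h above: member states are tested with a bitmask, so for_each_rw_member() and for_each_readable_member() are just __for_each_online_member() with different masks, and for_each_online_member() passes ~0 to accept any state. A tiny standalone illustration of the mask test; the enum ordering mirrors the bch_dev_state[] strings ("readwrite", "readonly", "failed", "spare") from opts.c, but the names here are illustrative:

#include <stdio.h>

/* Same ordering as the bch_dev_state[] strings in this patch */
enum member_state { STATE_RW, STATE_RO, STATE_FAILED, STATE_SPARE };

/* The test bch_get_next_online_dev() applies to each candidate device */
static int state_matches(enum member_state s, unsigned mask)
{
	return !!((1U << s) & mask);
}

int main(void)
{
	unsigned rw_only  = 1U << STATE_RW;
	unsigned readable = (1U << STATE_RW) | (1U << STATE_RO);

	printf("ro in rw_only:  %d\n", state_matches(STATE_RO, rw_only));	/* 0 */
	printf("ro in readable: %d\n", state_matches(STATE_RO, readable));	/* 1 */
	return 0;
}

This is what lets the superblock write path iterate every online device regardless of writability, while the allocator and moving GC start-up paths restrict themselves to RW members.
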