author | Kent Overstreet <kent.overstreet@gmail.com> | 2021-12-29 11:42:26 -0500
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2021-12-29 11:42:26 -0500
commit | dccca51ec74cb1841d7bc9530161f08e78b55e6e (patch)
tree | beef3ce6afe0a657f19f9dcccaf966f6e0e19340
parent | 3f86fd9e11b14b3c4c68f47f9680e397ee255fae (diff)
Merge with 312c3aa134 bcachefs: Fix keylist size in btree_update
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
63 files changed, 1147 insertions, 1523 deletions
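Among the allocator changes in the diff below, struct open_bucket stops embedding a full bch_extent_ptr and instead stores dev/gen/bucket directly, and a hash table (open_buckets_hash, chained through the new ob->hash field, with index 0 meaning "end of chain") is added so bch2_bucket_is_open() can check whether a given (dev, bucket) pair is currently open. The standalone sketch below models just that hashing scheme under simplified assumptions: a tiny fixed table, a placeholder hash mix instead of the kernel's jhash_3words(), no locking, and made-up helper names (hashslot, hash_add, hash_remove, bucket_is_open). It is illustrative only, not the bcachefs implementation shown in the alloc_foreground.c and alloc_foreground.h hunks.

/*
 * Sketch of the open-bucket hash added in this merge: open buckets live in a
 * fixed array, index 0 is reserved as "none", and entries are chained through
 * a small index field rather than pointers. The hash mix below is a
 * placeholder for illustration; the kernel uses jhash_3words(dev, lo, hi, 0).
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OPEN_BUCKETS_COUNT 16          /* power of two, like the real table */

typedef uint16_t open_bucket_idx_t;    /* 0 is never a valid index */

struct open_bucket {
	unsigned		dev;
	uint64_t		bucket;
	open_bucket_idx_t	hash;  /* next entry in this hash chain */
};

static struct open_bucket	open_buckets[OPEN_BUCKETS_COUNT];
static open_bucket_idx_t	open_buckets_hash[OPEN_BUCKETS_COUNT];

static open_bucket_idx_t *hashslot(unsigned dev, uint64_t bucket)
{
	/* placeholder mix; stands in for jhash_3words() in the kernel code */
	uint64_t h = (bucket * 0x9e3779b97f4a7c15ULL) ^ dev;

	return &open_buckets_hash[h & (OPEN_BUCKETS_COUNT - 1)];
}

static void hash_add(open_bucket_idx_t idx)
{
	struct open_bucket *ob = &open_buckets[idx];
	open_bucket_idx_t *slot = hashslot(ob->dev, ob->bucket);

	ob->hash = *slot;              /* push onto the front of the chain */
	*slot = idx;
}

static void hash_remove(open_bucket_idx_t idx)
{
	struct open_bucket *ob = &open_buckets[idx];
	open_bucket_idx_t *slot = hashslot(ob->dev, ob->bucket);

	while (*slot != idx) {         /* walk the chain until we find idx */
		assert(*slot);
		slot = &open_buckets[*slot].hash;
	}

	*slot = ob->hash;              /* unlink and clear our chain link */
	ob->hash = 0;
}

static bool bucket_is_open(unsigned dev, uint64_t bucket)
{
	open_bucket_idx_t slot = *hashslot(dev, bucket);

	while (slot) {
		struct open_bucket *ob = &open_buckets[slot];

		if (ob->dev == dev && ob->bucket == bucket)
			return true;
		slot = ob->hash;
	}
	return false;
}

int main(void)
{
	open_buckets[1] = (struct open_bucket) { .dev = 0, .bucket = 42 };
	open_buckets[2] = (struct open_bucket) { .dev = 1, .bucket = 42 };

	hash_add(1);
	hash_add(2);

	printf("0:42 open? %d\n", bucket_is_open(0, 42));  /* 1 */
	printf("1:7  open? %d\n", bucket_is_open(1, 7));   /* 0 */

	hash_remove(1);
	printf("0:42 open? %d\n", bucket_is_open(0, 42));  /* 0 */
	return 0;
}

Chaining through small array indices rather than pointers keeps struct open_bucket compact and lets a zero index double as the chain terminator, which is why the alloc_types.h hunk notes that 0 is never a valid open_bucket_idx_t.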
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 2588812c5066..5070caf8f349 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -212,7 +212,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct inode *vinode, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -224,6 +224,9 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type) struct bkey_s_c k; int ret; + if (rcu) + return ERR_PTR(-ECHILD); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -289,7 +292,8 @@ int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, return ret == -ENOENT ? 0 : ret; } -int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) +int bch2_set_acl(struct user_namespace *mnt_userns, + struct inode *vinode, struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -314,7 +318,7 @@ retry: mode = inode_u.bi_mode; if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(&inode->v, &mode, &acl); + ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); if (ret) goto btree_err; } diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 14cabbc91808..2d76a4897ba8 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -26,12 +26,12 @@ typedef struct { __le32 a_version; } bch_acl_header; -struct posix_acl *bch2_get_acl(struct inode *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct posix_acl *, int); -int bch2_set_acl(struct inode *, struct posix_acl *, int); +int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); int bch2_acl_chmod(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, umode_t, struct posix_acl **); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7d8d26e8c964..2a36af5e0220 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -354,6 +354,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); + *bucket_gen(ca, k.k->p.offset) = u.gen; g->_mark.gen = u.gen; g->_mark.data_type = u.data_type; g->_mark.dirty_sectors = u.dirty_sectors; @@ -513,6 +514,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, test_bit(b, ca->buckets_nouse)) return false; + if (ca->new_fs_bucket_idx) { + /* + * Device or filesystem is still being initialized, and we + * haven't fully marked superblocks & journal: + */ + if (is_superblock_bucket(ca, b)) + return false; + + if (b < ca->new_fs_bucket_idx) + return false; + } + gc_gen = bucket_gc_gen(bucket(ca, b)); ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; @@ -581,7 +594,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) buckets = bucket_array(ca); ca->alloc_heap.used = 0; now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.last_seq_ondisk; + last_seq_ondisk = c->journal.flushed_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -628,76 +641,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) up_read(&ca->bucket_lock); } -static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev 
*ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t b, start; - - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - start = ca->fifo_last_bucket; - - do { - ca->fifo_last_bucket++; - if (ca->fifo_last_bucket == ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - b = ca->fifo_last_bucket; - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } while (ca->fifo_last_bucket != start); -} - -static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t checked, i; - - for (checked = 0; - checked < ca->mi.nbuckets / 2; - checked++) { - size_t b = bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } - - sort(ca->alloc_heap.data, - ca->alloc_heap.used, - sizeof(ca->alloc_heap.data[0]), - bucket_idx_cmp, NULL); - - /* remove duplicates: */ - for (i = 0; i + 1 < ca->alloc_heap.used; i++) - if (ca->alloc_heap.data[i].bucket == - ca->alloc_heap.data[i + 1].bucket) - ca->alloc_heap.data[i].nr = 0; -} - static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { size_t i, nr = 0; @@ -705,17 +648,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; - switch (ca->mi.replacement) { - case BCH_CACHE_REPLACEMENT_lru: - find_reclaimable_buckets_lru(c, ca); - break; - case BCH_CACHE_REPLACEMENT_fifo: - find_reclaimable_buckets_fifo(c, ca); - break; - case BCH_CACHE_REPLACEMENT_random: - find_reclaimable_buckets_random(c, ca); - break; - } + find_reclaimable_buckets_lru(c, ca); heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); @@ -725,33 +658,11 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } -/* - * returns sequence number of most recent journal entry that updated this - * bucket: - */ -static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -{ - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - return bucket_seq; - } else { - return 0; - } -} - static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b) + struct bch_dev *ca, u64 b, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked u; struct btree_iter iter; int ret; @@ -765,16 +676,16 @@ static int bucket_invalidate_btree(struct btree_trans *trans, if (ret) goto err; - u = alloc_mem_to_key(c, &iter); + *u = alloc_mem_to_key(c, &iter); - u.gen++; - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = atomic64_read(&c->io_clock[READ].now); - u.write_time = atomic64_read(&c->io_clock[WRITE].now); + u->gen++; + u->data_type = 0; + u->dirty_sectors = 0; 
+ u->cached_sectors = 0; + u->read_time = atomic64_read(&c->io_clock[READ].now); + u->write_time = atomic64_read(&c->io_clock[WRITE].now); - ret = bch2_alloc_write(trans, &iter, &u, + ret = bch2_alloc_write(trans, &iter, u, BTREE_TRIGGER_BUCKET_INVALIDATE); err: bch2_trans_iter_exit(trans, &iter); @@ -784,21 +695,23 @@ err: static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq, unsigned flags) { - struct bucket *g; - struct bucket_mark m; + struct bkey_alloc_unpacked u; size_t b; int ret = 0; + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) + return 1; + BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); b = ca->alloc_heap.data[0].bucket; /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - - BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true); @@ -807,37 +720,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!fifo_push(&ca->free_inc, b)); spin_unlock(&c->freelist_lock); - /* - * If we're not invalidating cached data, we only increment the bucket - * gen in memory here, the incremented gen will be updated in the btree - * by bch2_trans_mark_pointer(): - */ - if (!m.cached_sectors && - !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { - BUG_ON(m.data_type); - bucket_cmpxchg(g, m, m.gen++); - percpu_up_read(&c->mark_lock); - goto out; - } - percpu_up_read(&c->mark_lock); - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { - ret = 1; - goto out; - } - ret = bch2_trans_do(c, NULL, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RESERVED| flags, - bucket_invalidate_btree(&trans, ca, b)); -out: + bucket_invalidate_btree(&trans, ca, b, &u)); + if (!ret) { /* remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; @@ -853,7 +744,7 @@ out: * bucket (i.e. 
deleting the last reference) before writing to * this bucket again: */ - *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + *journal_seq = max(*journal_seq, u.journal_seq); } else { size_t b2; @@ -1063,7 +954,7 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; } @@ -1133,7 +1024,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list && - ob->ptr.dev == ca->dev_idx) + ob->dev == ca->dev_idx) ret = true; spin_unlock(&ob->lock); } @@ -1280,22 +1171,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } - -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s\n", - ob - c->open_buckets, - atomic_read(&ob->pin), - bch2_data_types[ob->type]); - } - spin_unlock(&ob->lock); - } - -} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e3cdb8bc1dd8..86b64177b3d0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -142,6 +142,4 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); - #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2bb107b8b0b9..0a634125dc90 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -43,9 +43,32 @@ * reference _after_ doing the index update that makes its allocation reachable. 
*/ +static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + ob->hash = *slot; + *slot = idx; +} + +static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + while (*slot != idx) { + BUG_ON(!*slot); + slot = &c->open_buckets[*slot].hash; + } + + *slot = ob->hash; + ob->hash = 0; +} + void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { bch2_ec_bucket_written(c, ob); @@ -55,14 +78,16 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; - ob->type = 0; + ob->data_type = 0; spin_unlock(&ob->lock); percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); + ob->freelist = c->open_buckets_freelist; c->open_buckets_freelist = ob - c->open_buckets; @@ -81,8 +106,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c, unsigned i; open_bucket_for_each(c, obs, ob, i) - if (ob->ptr.dev == dev && - ob->ec) + if (ob->dev == dev && ob->ec) bch2_ec_bucket_cancel(c, ob); } @@ -95,7 +119,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ob = c->open_buckets + c->open_buckets_freelist; c->open_buckets_freelist = ob->freelist; atomic_set(&ob->pin, 1); - ob->type = 0; + ob->data_type = 0; c->open_buckets_nr_free--; return ob; @@ -105,8 +129,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bool may_realloc = wp->type == BCH_DATA_user; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + bool may_realloc = wp->data_type == BCH_DATA_user; BUG_ON(ca->open_buckets_partial_nr > ARRAY_SIZE(ca->open_buckets_partial)); @@ -133,31 +157,28 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) struct open_bucket *ob; unsigned i; + rcu_read_lock(); open_bucket_for_each(c, obs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - BUG_ON(ptr_stale(ca, &ob->ptr)); + BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); } + rcu_read_unlock(); #endif } /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { - struct bucket_array *buckets; - ssize_t b; + while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { + u64 b = ca->new_fs_bucket_idx++; - rcu_read_lock(); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark) && - !buckets->b[b].mark.owned_by_allocator) - goto success; - b = -1; -success: - rcu_read_unlock(); - return b; + if (!is_superblock_bucket(ca, b) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) + return b; + } + + return -1; } static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) @@ -251,15 +272,14 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; - ob->ptr = (struct 
bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = bucket(ca, b)->mark.gen, - .offset = bucket_to_sector(ca, b), - .dev = ca->dev_idx, - }; - + ob->dev = ca->dev_idx; + ob->gen = *bucket_gen(ca, b); + ob->bucket = b; spin_unlock(&ob->lock); + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); + if (c->blocked_allocate_open_bucket) { bch2_time_stats_update( &c->times[BCH_TIME_blocked_allocate_open_bucket], @@ -274,7 +294,6 @@ out: c->blocked_allocate = 0; } - ca->nr_open_buckets++; spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); @@ -338,9 +357,9 @@ static void add_new_bucket(struct bch_fs *c, struct open_bucket *ob) { unsigned durability = - bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; + bch_dev_bkey_exists(c, ob->dev)->mi.durability; - __clear_bit(ob->ptr.dev, devs_may_alloc->d); + __clear_bit(ob->dev, devs_may_alloc->d); *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ? durability : 1; *have_cache |= !durability; @@ -450,13 +469,13 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, continue; ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->ptr.dev == devs_sorted.devs[i] && + if (ob->dev == devs_sorted.devs[i] && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); ob->ec_idx = ec_idx; ob->ec = h->s; @@ -486,12 +505,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (*nr_effective < nr_replicas && - test_bit(ob->ptr.dev, devs_may_alloc->d) && + test_bit(ob->dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_user && !*have_cache)) && + (wp->data_type == BCH_DATA_user && !*have_cache)) && (ob->ec || !need_ec)) { add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, @@ -523,7 +542,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, unsigned i; rcu_read_lock(); - devs = target_rw_devs(c, wp->type, target); + devs = target_rw_devs(c, wp->data_type, target); rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ @@ -531,7 +550,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, __clear_bit(devs_have->devs[i], devs.d); open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + __clear_bit(ob->dev, devs.d); if (erasure_code) { if (!ec_open_bucket(c, ptrs)) { @@ -591,7 +610,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, unsigned i, j; open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->ptr.dev == ca->dev_idx; + bool drop = !ca || ob->dev == ca->dev_idx; if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); @@ -600,7 +619,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, continue; ob2 = c->open_buckets + ob->ec->blocks[j]; - drop |= ob2->ptr.dev == ca->dev_idx; + drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } @@ -784,11 +803,11 @@ retry: wp = writepoint_find(c, write_point.v); - if (wp->type == BCH_DATA_user) + if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; /* metadata may not allocate on cache devices: */ - if (wp->type != BCH_DATA_user) + if (wp->data_type != BCH_DATA_user) have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -866,12 +885,27 @@ err: } } +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, 
struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} + /* * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors) + struct bkey_i *k, unsigned sectors, + bool cached) { struct open_bucket *ob; @@ -881,14 +915,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, wp->sectors_free -= sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr tmp = ob->ptr; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - tmp.cached = !ca->mi.durability && - wp->type == BCH_DATA_user; + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); - tmp.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_bkey_append_ptr(k, tmp); + bch2_bkey_append_ptr(k, ptr); BUG_ON(sectors > ob->sectors_free); ob->sectors_free -= sectors; @@ -918,7 +952,7 @@ static inline void writepoint_init(struct write_point *wp, enum bch_data_type type) { mutex_init(&wp->lock); - wp->type = type; + wp->data_type = type; } void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -955,3 +989,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) writepoint_hash(c, wp->write_point)); } } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->data_type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 2e81712ba8d1..d466bda9afc8 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -85,12 +85,36 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->type = wp->type; + ob->data_type = wp->data_type; atomic_inc(&ob->pin); ob_push(c, ptrs, ob); } } +static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, + unsigned dev, u64 bucket) +{ + return c->open_buckets_hash + + (jhash_3words(dev, bucket, bucket >> 32, 0) & + (OPEN_BUCKETS_COUNT - 1)); +} + +static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) +{ + open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); + + while (slot) { + struct open_bucket *ob = &c->open_buckets[slot]; + + if (ob->dev == dev && ob->bucket == bucket) + return true; + + slot = ob->hash; + } + + return false; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, @@ -105,8 +129,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, struct closure *); +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned); + struct bkey_i *, 
unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, @@ -127,4 +152,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4a1cd8b73d16..409232e3d998 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -37,24 +37,31 @@ typedef FIFO(long) alloc_fifo; #define WRITE_POINT_HASH_NR 32 #define WRITE_POINT_MAX 32 +/* + * 0 is never a valid open_bucket_idx_t: + */ typedef u16 open_bucket_idx_t; struct open_bucket { spinlock_t lock; atomic_t pin; open_bucket_idx_t freelist; + open_bucket_idx_t hash; /* * When an open bucket has an ec_stripe attached, this is the index of * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - u8 type; + enum bch_data_type data_type:3; unsigned valid:1; unsigned on_partial_list:1; int alloc_reserve:3; + unsigned sectors_free; - struct bch_extent_ptr ptr; + u8 dev; + u8 gen; + u64 bucket; struct ec_stripe_new *ec; }; @@ -74,7 +81,7 @@ struct write_point { struct mutex lock; u64 last_used; unsigned long write_point; - enum bch_data_type type; + enum bch_data_type data_type; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ca82f5453c69..3ada85ac09c6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -445,6 +445,7 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; + struct bucket_gens *bucket_gens; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; @@ -453,6 +454,7 @@ struct bch_dev { struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ + u64 new_fs_bucket_idx; struct task_struct __rcu *alloc_thread; /* @@ -634,7 +636,6 @@ struct bch_fs { u16 version; u16 version_min; - u16 encoded_extent_max; u8 nr_devices; u8 clean; @@ -705,6 +706,7 @@ struct bch_fs { struct btree_path_buf __percpu *btree_paths_bufs; struct srcu_struct btree_trans_barrier; + bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; @@ -748,17 +750,18 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; u64 blocked_allocate; u64 blocked_allocate_open_bucket; + open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; struct closure_waitlist open_buckets_wait; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; struct write_point btree_write_point; struct write_point rebalance_write_point; @@ -809,7 +812,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -927,10 +930,20 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca) static inline unsigned block_bytes(const struct bch_fs *c) { - return c->opts.block_size << 9; + return c->opts.block_size; +} + +static inline unsigned block_sectors(const struct bch_fs *c) +{ + return 
c->opts.block_size >> 9; +} + +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> 9; } -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) +static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; s32 rem; @@ -942,13 +955,13 @@ static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time return t; } -static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) { return (ts.tv_sec * c->sb.time_units_per_sec + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; } -static inline s64 bch2_current_time(struct bch_fs *c) +static inline s64 bch2_current_time(const struct bch_fs *c) { struct timespec64 now; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 495f4d19ddcb..a053fca7886d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1063,8 +1063,7 @@ struct bch_member { }; LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -/* 4-10 unused, was TIER, HAS_(META)DATA */ -LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) @@ -1088,18 +1087,6 @@ enum bch_member_state { BCH_MEMBER_STATE_NR }; -#define BCH_CACHE_REPLACEMENT_POLICIES() \ - x(lru, 0) \ - x(fifo, 1) \ - x(random, 2) - -enum bch_cache_replacement_policies { -#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, - BCH_CACHE_REPLACEMENT_POLICIES() -#undef x - BCH_CACHE_REPLACEMENT_NR -}; - struct bch_sb_field_members { struct bch_sb_field field; struct bch_member members[0]; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 3bdaee446c18..b13563d70826 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -78,8 +78,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; #ifdef __KERNEL__ - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); #else b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, @@ -275,6 +274,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned long freed = 0; unsigned i, flags; + unsigned long ret = SHRINK_STOP; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -283,7 +283,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) - return -1; + goto out_norestore; flags = memalloc_nofs_save(); @@ -300,13 +300,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + touched++; if (touched >= nr) break; - if (++i > 3 && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -352,8 +358,14 @@ restart: 
mutex_unlock(&bc->lock); out: + ret = (unsigned long) freed * btree_pages(c); memalloc_nofs_restore(flags); - return (unsigned long) freed * btree_pages(c); +out_norestore: + trace_btree_cache_scan(sc->nr_to_scan, + sc->nr_to_scan / btree_pages(c), + btree_cache_can_free(bc), + ret); + return ret; } static unsigned long bch2_btree_cache_count(struct shrinker *shrink, @@ -769,16 +781,17 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * EBUG_ON(level >= BTREE_MAX_DEPTH); - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - /* - * Check b->hash_val _before_ calling btree_node_lock() - this - * might not be the node we want anymore, and trying to lock the - * wrong node could cause an unneccessary transaction restart: - */ - if (b && b->hash_val == btree_ptr_hash_val(k)) + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (likely(c->opts.btree_node_mem_ptr_optimization && + b && + b->hash_val == btree_ptr_hash_val(k))) goto lock_node; - } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 402cec1802bc..f7e10986f317 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -71,7 +71,7 @@ static inline bool btree_node_hashed(struct btree *b) static inline size_t btree_bytes(struct bch_fs *c) { - return c->opts.btree_node_size << 9; + return c->opts.btree_node_size; } static inline size_t btree_max_u64s(struct bch_fs *c) @@ -86,7 +86,7 @@ static inline size_t btree_pages(struct bch_fs *c) static inline unsigned btree_blocks(struct bch_fs *c) { - return c->opts.btree_node_size >> c->block_bits; + return btree_sectors(c) >> c->block_bits; } #define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 91c69a9f96ae..d1883701afc3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -170,10 +170,10 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); + kfree(new); + + if (ret) return ret; - } bch2_btree_node_drop_keys_outside_node(b); @@ -199,10 +199,10 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); + kfree(new); + + if (ret) return ret; - } bch2_btree_node_drop_keys_outside_node(b); @@ -504,8 +504,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, */ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -643,14 +643,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, 
ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); ptr->gen = g->mark.gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && @@ -691,9 +691,9 @@ found: } ret = bch2_journal_key_insert(c, btree_id, level, new); - if (ret) - kfree(new); - else + kfree(new); + + if (!ret) *k = bkey_i_to_s_c(new); } fsck_err: @@ -737,7 +737,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, ptrs = bch2_bkey_ptrs_c(*k); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); if (gen_after(g->oldest_gen, ptr->gen)) g->oldest_gen = ptr->gen; @@ -1056,23 +1056,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, } while (start < end); } -void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) +static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; u64 b; - /* - * This conditional is kind of gross, but we may be called from the - * device add path, before the new device has actually been added to the - * running filesystem: - */ - if (c) { - lockdep_assert_held(&c->sb_lock); - percpu_down_read(&c->mark_lock); - } - for (i = 0; i < layout->nr_superblocks; i++) { u64 offset = le64_to_cpu(layout->sb_offset[i]); @@ -1091,9 +1081,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } - - if (c) - percpu_up_read(&c->mark_lock); } static void bch2_mark_superblocks(struct bch_fs *c) @@ -1283,7 +1270,6 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca = NULL; unsigned i; - int ret; BUG_ON(c->usage_gc); @@ -1315,12 +1301,6 @@ static int bch2_gc_start(struct bch_fs *c, } } - ret = bch2_ec_mem_alloc(c, true); - if (ret) { - bch_err(c, "error allocating ec gc mem"); - return ret; - } - percpu_down_write(&c->mark_lock); /* @@ -1403,8 +1383,7 @@ static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, } ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - if (ret) - kfree(new); + kfree(new); } fsck_err: return ret; @@ -1529,8 +1508,7 @@ inconsistent: stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); - if (ret) - kfree(new); + kfree(new); } fsck_err: return ret; @@ -1768,7 +1746,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + struct bucket *g = PTR_BUCKET(ca, ptr); if (gen_after(g->mark.gen, ptr->gen) > 16) { percpu_up_read(&c->mark_lock); @@ -1778,7 +1756,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + struct bucket *g = PTR_BUCKET(ca, ptr); if (gen_after(g->gc_gen, ptr->gen)) g->gc_gen = ptr->gen; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 59dfb069e699..0665f5941fcc 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); -void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* * For concurrent mark and sweep (with other index updates), we define a total diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2c2ec614892b..1455dc787190 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -33,6 +33,8 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -51,12 +53,16 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); } void bch2_btree_node_wait_on_write(struct btree *b) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -681,7 +687,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FATAL, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > c->opts.btree_node_size, + if (btree_err_on(offset + sectors > btree_sectors(c), BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; @@ -895,7 +901,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->data->keys.seq, bp->seq); } - while (b->written < (ptr_written ?: c->opts.btree_node_size)) { + while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; struct bch_csum csum; @@ -1204,7 +1210,7 @@ static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) if (le64_to_cpu(bn->magic) != bset_magic(c)) return 0; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { offset += vstruct_sectors(bn, c->block_bits); } else { @@ -1226,7 +1232,7 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * if (!offset) return false; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = data + (offset << 9); if (bne->keys.seq == bn->keys.seq) return true; @@ -1296,7 +1302,7 @@ fsck_err: if (ra->err[i]) continue; 
- while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); } else { @@ -1313,7 +1319,7 @@ fsck_err: offset += sectors; } - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) @@ -1791,8 +1797,8 @@ do_write: BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); - BUG_ON(b->written >= c->opts.btree_node_size); - BUG_ON(b->written & (c->opts.block_size - 1)); + BUG_ON(b->written >= btree_sectors(c)); + BUG_ON(b->written & (block_sectors(c) - 1)); BUG_ON(bset_written(b, btree_bset_last(b))); BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -1865,7 +1871,7 @@ do_write: memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); - BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); + BUG_ON(b->written + sectors_to_write > btree_sectors(c)); BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index f11a2e96227b..0f20224e2a77 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -122,7 +122,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4fc77dff1b17..65ab2cd64dde 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -561,6 +561,8 @@ void bch2_trans_unlock(struct btree_trans *trans) trans_for_each_path(trans, path) __bch2_btree_path_unlock(path); + + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); } /* Btree iterator: */ @@ -744,6 +746,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_NOPRESERVE| BTREE_ITER_ALL_SNAPSHOTS); prev = bch2_btree_iter_prev(©); if (!prev.k) @@ -1816,12 +1819,14 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, return path; } -struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, +struct btree_path *bch2_path_get(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned locks_want, unsigned level, - bool intent, unsigned long ip) + unsigned flags, unsigned long ip) { struct btree_path *path, *path_pos = NULL; + bool cached = flags & BTREE_ITER_CACHED; + bool intent = flags & BTREE_ITER_INTENT; int i; BUG_ON(trans->restarted); @@ -1843,7 +1848,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, path_pos->level == level) { __btree_path_get(path_pos, intent); path = btree_path_set_pos(trans, path_pos, pos, intent, ip); - path->preserve = true; } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -1852,7 +1856,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, path->pos = pos; path->btree_id = btree_id; path->cached = cached; - path->preserve = true; path->uptodate = BTREE_ITER_NEED_TRAVERSE; path->should_be_locked = false; path->level = level; @@ -1867,6 +1870,9 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, 
bool cached, btree_trans_verify_sorted(trans); } + if (!(flags & BTREE_ITER_NOPRESERVE)) + path->preserve = true; + if (path->intent_ref) locks_want = max(locks_want, level + 1); @@ -2623,13 +2629,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->ip_allocated = ip; #endif - iter->path = bch2_path_get(trans, - flags & BTREE_ITER_CACHED, - btree_id, - iter->pos, - locks_want, - depth, - flags & BTREE_ITER_INTENT, ip); + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags, ip); } void bch2_trans_iter_init(struct btree_trans *trans, @@ -2956,22 +2957,27 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_paths_pool); - cleanup_srcu_struct(&c->btree_trans_barrier); } int bch2_fs_btree_iter_init(struct bch_fs *c) { unsigned nr = BTREE_ITER_MAX; + int ret; INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return init_srcu_struct(&c->btree_trans_barrier) ?: - mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, sizeof(struct btree_path) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, - BTREE_TRANS_MEM_MAX); + BTREE_TRANS_MEM_MAX) ?: + init_srcu_struct(&c->btree_trans_barrier); + if (!ret) + c->btree_trans_barrier_initialized = true; + return ret; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 75767f148a11..4c903b9dd716 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -134,10 +134,9 @@ bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool, unsigned long); int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, - struct bpos, unsigned, unsigned, bool, - unsigned long); -struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, + unsigned, unsigned, unsigned, unsigned long); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 805c0496b7cb..230a920ae32a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -598,7 +598,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = *rht_bucket(tbl, bc->shrink_iter); + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); @@ -662,11 +662,12 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); - } + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } rcu_read_unlock(); list_for_each_entry_safe(ck, n, &bc->freed, list) { diff --git a/fs/bcachefs/btree_types.h 
b/fs/bcachefs/btree_types.h index 22dbbe365bbe..c84bba7bcda5 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -210,6 +210,7 @@ struct btree_node_iter { #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) +#define BTREE_ITER_NOPRESERVE (1 << 14) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d895d4eff0a9..6872e56b5c41 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -223,12 +223,12 @@ retry: if (IS_ERR(wp)) return ERR_CAST(wp); - if (wp->sectors_free < c->opts.btree_node_size) { + if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < c->opts.btree_node_size) + if (ob->sectors_free < btree_sectors(c)) ob->sectors_free = 0; bch2_alloc_sectors_done(c, wp); @@ -236,7 +236,7 @@ retry: } bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); @@ -1029,7 +1029,7 @@ retry: } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * c->opts.btree_node_size, + nr_nodes * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) @@ -1609,8 +1609,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos, - U8_MAX, level, true, _THIS_IP_); + sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index d4574161a733..8dc86fa636d6 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -82,12 +82,12 @@ struct btree_update { /* Nodes being freed: */ struct keylist old_keys; u64 _old_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* Nodes being added: */ struct keylist new_keys; u64 _new_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* New nodes, that will be made reachable by this update: */ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; @@ -218,7 +218,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size << 6; + ssize_t total = c->opts.btree_node_size >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 95d19887bd40..1966441b1a62 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1271,24 +1271,23 @@ err: * When deleting, check if we need to emit a whiteout (because we're overwriting * something in an ancestor snapshot) */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, struct btree_iter *orig) +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) { struct btree_iter iter; struct bkey_s_c k; - u32 snapshot = orig->pos.snapshot; + u32 snapshot = 
pos.snapshot; int ret; - if (!bch2_snapshot_parent(trans->c, snapshot)) + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) return 0; - bch2_trans_copy_iter(&iter, orig); - iter.flags &= BTREE_ITER_FILTER_SNAPSHOTS; - iter.flags |= BTREE_ITER_ALL_SNAPSHOTS; + pos.snapshot++; - bch2_btree_iter_advance(&iter); - - for_each_btree_key_continue_norestart(iter, 0, k, ret) { - if (bkey_cmp(k.k->p, orig->pos)) + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (bkey_cmp(k.k->p, pos)) break; if (bch2_snapshot_is_ancestor(trans->c, snapshot, @@ -1314,7 +1313,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); - BUG_ON(bpos_cmp(k->k.p, iter->pos)); n = (struct btree_insert_entry) { .flags = flags, @@ -1335,7 +1333,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (bkey_deleted(&n.k->k) && (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, iter); + int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); if (unlikely(ret < 0)) return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 762366f58519..738ce67d7a1a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -50,7 +50,7 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, void bch2_bucket_seq_cleanup(struct bch_fs *c) { u64 journal_seq = atomic64_read(&c->journal.seq); - u16 last_seq_ondisk = c->journal.last_seq_ondisk; + u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; struct bch_dev *ca; struct bucket_array *buckets; struct bucket *g; @@ -340,13 +340,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bucket_mark old, - struct bucket_mark new) -{ - return is_available_bucket(old) && - !is_available_bucket(new); -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -532,19 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -#define do_mark_fn(fn, c, pos, flags, ...) 
\ -({ \ - int gc, ret = 0; \ - \ - percpu_rwsem_assert_held(&c->mark_lock); \ - \ - for (gc = 0; gc < 2 && !ret; gc++) \ - if (!gc == !(flags & BTREE_TRIGGER_GC) || \ - (gc && gc_visited(c, pos))) \ - ret = fn(c, __VA_ARGS__, gc); \ - ret; \ -}) - void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator) { @@ -558,6 +538,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } +static inline u8 bkey_alloc_gen(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_alloc: + return bkey_s_c_to_alloc(k).v->gen; + case KEY_TYPE_alloc_v2: + return bkey_s_c_to_alloc_v2(k).v->gen; + case KEY_TYPE_alloc_v3: + return bkey_s_c_to_alloc_v3(k).v->gen; + default: + return 0; + } +} + static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -565,16 +559,13 @@ static int bch2_mark_alloc(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked u; + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; int ret = 0; - /* We don't do anything for deletions - do we?: */ - if (!bkey_is_alloc(new.k)) - return 0; - /* * alloc btree is read in by bch2_alloc_read, not gc: */ @@ -582,13 +573,21 @@ static int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - if (flags & BTREE_TRIGGER_INSERT) { + if ((flags & BTREE_TRIGGER_INSERT) && + !old_u.data_type != !new_u.data_type && + new.k->type == KEY_TYPE_alloc_v3) { struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_alloc_v3); - v->journal_seq = cpu_to_le64(journal_seq); + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = !new_u.data_type && + bch2_journal_noflush_seq(&c->journal, journal_seq) + ? 
0 : cpu_to_le64(journal_seq); } ca = bch_dev_bkey_exists(c, new.k->p.inode); @@ -597,15 +596,17 @@ static int bch2_mark_alloc(struct btree_trans *trans, return 0; percpu_down_read(&c->mark_lock); + if (!gc && new_u.gen != bkey_alloc_gen(old)) + *bucket_gen(ca, new.k->p.offset) = new_u.gen; + g = __bucket(ca, new.k->p.offset, gc); - u = bch2_alloc_unpack(new); old_m = bucket_cmpxchg(g, m, ({ - m.gen = u.gen; - m.data_type = u.data_type; - m.dirty_sectors = u.dirty_sectors; - m.cached_sectors = u.cached_sectors; - m.stripe = u.stripe != 0; + m.gen = new_u.gen; + m.data_type = new_u.data_type; + m.dirty_sectors = new_u.dirty_sectors; + m.cached_sectors = new_u.cached_sectors; + m.stripe = new_u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -615,12 +616,12 @@ static int bch2_mark_alloc(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; + g->io_time[READ] = new_u.read_time; + g->io_time[WRITE] = new_u.write_time; + g->oldest_gen = new_u.oldest_gen; g->gen_valid = 1; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; percpu_up_read(&c->mark_lock); /* @@ -655,17 +656,27 @@ static int bch2_mark_alloc(struct btree_trans *trans, overflow; \ }) -static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, bool gc) +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) { - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g; struct bucket_mark old, new; bool overflow; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); BUG_ON(data_type != BCH_DATA_sb && data_type != BCH_DATA_journal); + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return; + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; overflow = checked_add(new.dirty_sectors, sectors); @@ -683,32 +694,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - if (c) - bch2_dev_usage_update(c, ca, old, new, 0, gc); - - return 0; -} - -void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type type, - unsigned sectors, struct gc_pos pos, - unsigned flags) -{ - BUG_ON(type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return; - - if (likely(c)) { - do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, - ca, b, type, sectors); - } else { - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); - } + bch2_dev_usage_update(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); } static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) @@ -809,17 +796,18 @@ static int mark_stripe_bucket(struct btree_trans *trans, enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g; struct bucket_mark new, old; char buf[200]; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); - g = PTR_BUCKET(ca, ptr, gc); + g = PTR_GC_BUCKET(ca, ptr); if (g->mark.dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { @@ -853,7 +841,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -889,18 +877,19 @@ static int bch2_mark_pointer(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); + struct bucket *g; u8 bucket_data_type; u64 v; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + percpu_down_read(&c->mark_lock); - g = PTR_BUCKET(ca, &p.ptr, gc); + g = PTR_GC_BUCKET(ca, &p.ptr); v = atomic64_read(&g->_mark.v); do { @@ -930,9 +919,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); - - BUG_ON(!gc && bucket_became_unavailable(old, new)); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -946,37 +933,35 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bch_replicas_padded r; + struct gc_stripe *m; - if (!gc) { - BUG(); - } else { - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + BUG_ON(!(flags & BTREE_TRIGGER_GC)); - if (!m) - return -ENOMEM; + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - spin_lock(&c->ec_stripes_heap_lock); - - if (!m || !m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } + if (!m) + return -ENOMEM; - m->block_sectors[p.block] += sectors; + spin_lock(&c->ec_stripes_heap_lock); - r = m->r; + if (!m || !m->alive) { spin_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + bch2_inconsistent_error(c); + return -EIO; } + m->block_sectors[p.block] += sectors; + + r = m->r; + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + return 0; } @@ -984,7 +969,6 @@ static int bch2_mark_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -996,12 +980,14 @@ static int bch2_mark_extent(struct btree_trans *trans, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? 
c->opts.btree_node_size + ? btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -1022,7 +1008,7 @@ static int bch2_mark_extent(struct btree_trans *trans, if (p.ptr.cached) { if (!stale) { ret = update_cached_sectors(c, k, p.ptr.dev, - disk_sectors, journal_seq, gc); + disk_sectors, journal_seq, true); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); return ret; @@ -1047,7 +1033,7 @@ static int bch2_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) { - ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); if (ret) { char buf[200]; @@ -1114,7 +1100,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); } } else { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx); + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) + return -ENOMEM; /* * This will be wrong when we bring back runtime gc: we should @@ -1198,6 +1188,8 @@ static int bch2_mark_reservation(struct btree_trans *trans, unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; @@ -1247,19 +1239,13 @@ not_found: */ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { - struct bkey_i_error *new; - - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (!new) { - bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; - } + struct bkey_i_error new; - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = p.k->p; - new->k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + bkey_init(&new.k); + new.k.type = KEY_TYPE_error; + new.k.p = p.k->p; + new.k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); } fsck_err: return ret; @@ -1278,6 +1264,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { idx -= le32_to_cpu(p.v->front_pad); end += le32_to_cpu(p.v->back_pad); @@ -1604,7 +1592,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? c->opts.btree_node_size + ? 
btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; @@ -2170,16 +2158,25 @@ static void buckets_free_rcu(struct rcu_head *rcu) buckets->nbuckets * sizeof(struct bucket)); } +static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ + struct bucket_gens *buckets = + container_of(rcu, struct bucket_gens, rcu); + + kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); +} + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / c->opts.btree_node_size); + ca->mi.bucket_size / btree_sectors(c)); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); @@ -2196,6 +2193,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || + !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || @@ -2208,6 +2207,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; bch2_copygc_stop(c); @@ -2218,6 +2219,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } old_buckets = bucket_array(ca); + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { size_t n = min(buckets->nbuckets, old_buckets->nbuckets); @@ -2225,13 +2227,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); - buckets = old_buckets; + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + buckets = old_buckets; + bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); @@ -2265,6 +2272,8 @@ err: free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ac9b554acd86..45c6d230f242 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -53,11 +53,34 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) return buckets->b + b; } +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, true); +} + static inline struct bucket *bucket(struct bch_dev *ca, size_t b) { return __bucket(ca, b, false); } +static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->bucket_gens, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); + +} + +static inline u8 *bucket_gen(struct 
bch_dev *ca, size_t b) +{ + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + return gens->b + b; +} + /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. @@ -75,10 +98,15 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, } static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - bool gc) + const struct bch_extent_ptr *ptr) { - return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); + return bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); } static inline enum bch_data_type ptr_data_type(const struct bkey *k, @@ -91,18 +119,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } -static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - struct bucket_mark m; - - rcu_read_lock(); - m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); - rcu_read_unlock(); - - return m; -} - static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); @@ -122,7 +138,13 @@ static inline int gen_after(u8 a, u8 b) static inline u8 ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); + u8 ret; + + rcu_read_lock(); + ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + rcu_read_unlock(); + + return ret; } /* bucket gc marks */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index b2de2995c5e7..18bca269b750 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -52,6 +52,13 @@ struct bucket_array { struct bucket b[]; }; +struct bucket_gens { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + u8 b[]; +}; + struct bch_dev_usage { u64 buckets_ec; u64 buckets_unavailable; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index f4a3e4854ef0..fbe8603cfb30 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -11,7 +11,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -93,21 +93,21 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -120,8 +120,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 
0, 0); int ret; if (!chacha20) { @@ -129,7 +129,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -137,7 +138,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -230,7 +231,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif @@ -253,7 +254,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -498,7 +499,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -581,7 +582,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -609,7 +610,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -641,7 +642,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 5e0e77ca71a9..f5c1a609c5c4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -140,9 +140,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 773cf87812ad..8e4179d8dc27 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -26,7 +26,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) { void *b; - BUG_ON(size > c->sb.encoded_extent_max << 9); + BUG_ON(size > c->opts.encoded_extent_max); b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); if (b) @@ -45,7 +45,7 @@ static bool 
bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; @@ -68,7 +68,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct page **pages = NULL; void *data; - BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + BUG_ON(start.bi_size > c->opts.encoded_extent_max); if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) @@ -231,8 +231,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size > c->sb.encoded_extent_max || - crc->compressed_size > c->sb.encoded_extent_max) { + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { bch_err(c, "error rewriting existing data: extent too big"); return -EIO; } @@ -272,8 +272,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, size_t dst_len = crc.uncompressed_size << 9; int ret = -ENOMEM; - if (crc.uncompressed_size > c->sb.encoded_extent_max || - crc.compressed_size > c->sb.encoded_extent_max) + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -376,7 +376,7 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); /* If it's only one block, don't bother trying to compress: */ - if (bio_sectors(src) <= c->opts.block_size) + if (src->bi_iter.bi_size <= c->opts.block_size) return 0; dst_data = bio_map_or_bounce(c, dst, WRITE); @@ -466,7 +466,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); @@ -544,10 +544,9 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t max_extent = c->sb.encoded_extent_max << 9; size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); + ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); struct { unsigned feature; unsigned type; @@ -579,14 +578,14 @@ have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } if (!mempool_initialized(&c->compression_bounce[WRITE])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 666635f7c7d2..ee5b7f696796 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -373,7 +373,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, i->id, 
i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(&iter)).k && !(err = bkey_err(k))) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 993e4fb3ab64..3cccd1faade5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -395,7 +395,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { - unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, DIV_ROUND_UP(bytes, PAGE_SIZE)); unsigned b = min_t(size_t, bytes - offset, nr_iovecs << PAGE_SHIFT); @@ -1063,7 +1063,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) if (!ob) return NULL; - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); @@ -1152,7 +1152,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; @@ -1318,7 +1318,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data + h->s->nr_parity); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1346,7 +1346,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1535,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) continue; ob = c->open_buckets + h->s->blocks[i]; - if (ob->ptr.dev == ca->dev_idx) + if (ob->dev == ca->dev_idx) goto found; } goto unlock; @@ -1608,46 +1608,6 @@ int bch2_stripes_read(struct bch_fs *c) return ret; } -int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - size_t i, idx = 0; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); - - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (!ret && k.k) - idx = k.k->p.offset + 1; - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - if (ret) - return ret; - - if (!idx) - return 0; - - if (!gc && - !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), - GFP_KERNEL)) - return -ENOMEM; -#if 0 - ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -#else - for (i = 0; i < idx; i++) - if (!gc - ? 
!genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL) - : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL)) - return -ENOMEM; -#endif - return 0; -} - void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 468141072bb4..78d468c7680a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -217,8 +217,6 @@ void bch2_stripes_heap_start(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -int bch2_ec_mem_alloc(struct bch_fs *, bool); - void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 89b5be907eea..44c584e9adaa 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -303,7 +303,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.csum_type && lp.crc.uncompressed_size + - rp.crc.uncompressed_size > c->sb.encoded_extent_max) + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > @@ -1038,7 +1038,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_btree_ptr || k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = c->opts.btree_node_size; + size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c8686838a314..9cdd03f3eeb0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -285,28 +285,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -320,13 +305,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. 
- */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -878,18 +857,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -903,10 +876,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -925,31 +898,29 @@ struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -957,41 +928,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -1029,11 +968,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && 
!radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1169,10 +1105,9 @@ err: bch2_bkey_buf_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -1180,7 +1115,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); @@ -1192,7 +1127,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, unsigned n = min_t(unsigned, readpages_iter.nr_pages - readpages_iter.idx, - BIO_MAX_PAGES); + BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), opts); @@ -1212,8 +1147,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -1308,36 +1241,37 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; up(&io->op.c->io_in_flight); if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1361,7 +1295,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1395,7 +1329,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, { struct bch_write_op *op; - w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, + w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); @@ -1515,9 +1449,9 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + 
(sectors << 9) >= - (BIO_MAX_PAGES * PAGE_SIZE) || + (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1793,8 +1727,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - copied, PAGE_SIZE - pg_offset); - unsigned pg_copied = iov_iter_copy_from_user_atomic(page, - iter, pg_offset, pg_len); + unsigned pg_copied = copy_page_from_iter_atomic(page, + pg_offset, pg_len,iter); if (!pg_copied) break; @@ -1807,7 +1741,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } flush_dcache_page(page); - iov_iter_advance(iter, pg_copied); copied += pg_copied; if (pg_copied != pg_len) @@ -1925,18 +1858,6 @@ again: /* O_DIRECT reads */ -static void bio_release_pages(struct bio *bio, bool mark_dirty) -{ - struct bio_vec *bvec; - unsigned i; - - bio_for_each_segment_all(bvec, bio, i) { - if (mark_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); - } -} - static void bio_check_or_release(struct bio *bio, bool check_dirty) { if (check_dirty) { @@ -2000,7 +1921,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) iter->count -= shorten; bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -2035,7 +1956,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: @@ -2168,8 +2089,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned, iter_count; + unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; @@ -2182,7 +2104,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) iter_count = dio->iter.count; if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -2192,7 +2114,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -2289,8 +2211,9 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) - put_page(bv->bv_page); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); bio->bi_vcnt = 0; if (dio->op.error) { @@ -2314,8 +2237,9 @@ err: if (dio->free_iov) kfree(dio->iter.iov); - bio_for_each_segment_all(bv, bio, i) - put_page(bv->bv_page); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ @@ -2382,7 +2306,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_is_bvec(iter) + ? 
0 + : iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); init_completion(&dio->done); @@ -2669,7 +2595,7 @@ static int bch2_extend(struct user_namespace *mnt_userns, truncate_setsize(&inode->v, iattr->ia_size); - return bch2_setattr_nonsize(inode, iattr); + return bch2_setattr_nonsize(mnt_userns, inode, iattr); } static int bch2_truncate_finish_fn(struct bch_inode_info *inode, @@ -2789,7 +2715,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); mutex_unlock(&inode->ei_update_lock); - ret = bch2_setattr_nonsize(inode, iattr); + ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; @@ -3230,235 +3156,6 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return ret; } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. 
*/ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. 
- */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index f9e7f49b13c7..b24efeaf343e 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); @@ -36,10 +35,6 @@ int bch2_truncate(struct user_namespace *, struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a76017386593..9f329a624c12 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -85,7 +85,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto setflags_out; } @@ -156,7 +156,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto err; } @@ -268,22 +268,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) down_write(&c->vfs_sb->s_umount); switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(c->vfs_sb->s_bdev); + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); if (ret) goto err; - if (sb && !IS_ERR(sb)) { - bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; - bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev, sb); - } + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); break; - } case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); + fallthrough; case FSOP_GOING_FLAGS_NOLOGFLUSH: c->vfs_sb->s_flags |= SB_RDONLY; @@ -379,7 +377,8 @@ retry: goto err3; } - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(file_mnt_user_ns(filp), + dir, MAY_WRITE | MAY_EXEC); if (error) goto err3; @@ -394,7 +393,7 @@ retry: !arg.src_ptr) snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; - inode = __bch2_create(NULL, to_bch_ei(dir), + inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); error = PTR_ERR_OR_ZERO(inode); @@ -443,8 +442,10 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, dir = path.dentry->d_parent->d_inode; ret = __bch2_unlink(dir, path.dentry, true); - if (!ret) + if (!ret) { + fsnotify_rmdir(dir, path.dentry); d_delete(path.dentry); + } path_put(&path); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b450a2feb52e..2d2ad7f768c0 100644 --- 
a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -264,7 +264,6 @@ __bch2_create(struct user_namespace *mnt_userns, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct user_namespace *ns = dir->v.i_sb->s_user_ns; struct btree_trans trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; @@ -305,8 +304,8 @@ retry: inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, - from_kuid(ns, current_fsuid()), - from_kgid(ns, current_fsgid()), + from_kuid(mnt_userns, current_fsuid()), + from_kgid(mnt_userns, current_fsgid()), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -410,11 +409,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, return d_splice_alias(vinode, dentry); } -static int bch2_mknod(struct inode *vdir, struct dentry *dentry, +static int bch2_mknod(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(NULL, to_bch_ei(vdir), dentry, mode, rdev, + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) @@ -424,10 +424,11 @@ static int bch2_mknod(struct inode *vdir, struct dentry *dentry, return 0; } -static int bch2_create(struct inode *vdir, struct dentry *dentry, +static int bch2_create(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, @@ -516,14 +517,15 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return __bch2_unlink(vdir, dentry, false); } -static int bch2_symlink(struct inode *vdir, struct dentry *dentry, +static int bch2_symlink(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, const char *symname) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(NULL, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -550,12 +552,14 @@ err: return ret; } -static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_mkdir(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); } -static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, +static int bch2_rename2(struct user_namespace *mnt_userns, + struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { @@ -661,7 +665,8 @@ err: return ret; } -static void bch2_setattr_copy(struct bch_inode_info *inode, +static void bch2_setattr_copy(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct iattr *attr) { @@ -669,9 +674,9 @@ static void bch2_setattr_copy(struct bch_inode_info *inode, unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); + bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, 
attr->ia_gid); + bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -690,13 +695,14 @@ static void bch2_setattr_copy(struct bch_inode_info *inode, : inode->v.i_gid; if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) + !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) mode &= ~S_ISGID; bi->bi_mode = mode; } } -int bch2_setattr_nonsize(struct bch_inode_info *inode, +int bch2_setattr_nonsize(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -733,7 +739,7 @@ retry: if (ret) goto btree_err; - bch2_setattr_copy(inode, &inode_u, attr); + bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, @@ -765,7 +771,8 @@ err: return ret; } -static int bch2_getattr(const struct path *path, struct kstat *stat, +static int bch2_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); @@ -805,26 +812,28 @@ static int bch2_getattr(const struct path *path, struct kstat *stat, return 0; } -static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) +static int bch2_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); int ret; lockdep_assert_held(&inode->v.i_rwsem); - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(mnt_userns, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(NULL, inode, iattr) - : bch2_setattr_nonsize(inode, iattr); + ? 
bch2_truncate(mnt_userns, inode, iattr) + : bch2_setattr_nonsize(mnt_userns, inode, iattr); } -static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_tmpfile(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(NULL, to_bch_ei(vdir), dentry, mode, 0, + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) @@ -857,8 +866,8 @@ static int bch2_fill_extent(struct bch_fs *c, else offset += p.crc.offset; - if ((offset & (c->opts.block_size - 1)) || - (k.k->size & (c->opts.block_size - 1))) + if ((offset & (block_sectors(c) - 1)) || + (k.k->size & (block_sectors(c) - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, @@ -903,6 +912,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u32 snapshot; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; @@ -1018,15 +1031,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode_inum(inode), ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -1041,7 +1045,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1111,7 +1115,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1568,13 +1572,14 @@ static int bch2_sync_fs(struct super_block *sb, int wait) static struct bch_fs *bch2_path_to_fs(const char *path) { struct bch_fs *c; - struct block_device *bdev = lookup_bdev(path); + dev_t dev; + int ret; - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); - c = bch2_dev_to_fs(bdev->bd_dev); - bdput(bdev); + c = bch2_dev_to_fs(dev); if (c) closure_put(&c->cl); return c ?: ERR_PTR(-ENOENT); @@ -1676,7 +1681,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -1830,6 +1835,8 @@ got_sb: sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); c->vfs_sb = sb; strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -1837,9 +1844,7 @@ got_sb: if (ret) goto err_put_super; - sb->s_bdi->congested_fn = bch2_congested; - sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 
1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; @@ -1858,7 +1863,7 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - sb->s_shrink.seeks = 1; + sb->s_shrink.seeks = 0; vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b5bc70afb100..b2211ec7f302 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -186,7 +186,8 @@ void bch2_inode_update_after_write(struct btree_trans *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); -int bch2_setattr_nonsize(struct bch_inode_info *, +int bch2_setattr_nonsize(struct user_namespace *, + struct bch_inode_info *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3f8b2a06bc3e..361dbf338023 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2043,8 +2043,8 @@ static void inc_link(struct bch_fs *c, struct snapshots_seen *s, if (inum < range_start || inum >= range_end) return; - link = bsearch(&key, links->d, links->nr, - sizeof(links->d[0]), nlink_cmp); + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); if (!link) return; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 99b2a77ef9a8..ef6da53567b8 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -4,6 +4,7 @@ #include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" +#include "buckets.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -588,6 +589,8 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, int ret = 0; while (!ret || ret == -EINTR) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); struct btree_iter iter; struct bkey_s_c k; struct bkey_i delete; @@ -630,8 +633,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, NULL, + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); err: offset = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6b969d8f1155..50b90b728a6d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -136,10 +136,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -665,11 +665,7 @@ static void init_append_extent(struct bch_write_op *op, { struct bch_fs *c = op->c; struct bkey_i_extent *e; - struct open_bucket *ob; - unsigned i; - BUG_ON(crc.compressed_size > wp->sectors_free); - wp->sectors_free -= crc.compressed_size; op->pos.offset += crc.uncompressed_size; e = bkey_extent_init(op->insert_keys.top); @@ -682,22 +678,8 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - union bch_extent_entry *end = - bkey_val_end(bkey_i_to_s(&e->k_i)); - - end->ptr = ob->ptr; - end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - 
end->ptr.cached = !ca->mi.durability || - (op->flags & BCH_WRITE_CACHED) != 0; - end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; - - e->k.u64s++; - - BUG_ON(crc.compressed_size > ob->sectors_free); - ob->sectors_free -= crc.compressed_size; - } + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); } @@ -717,7 +699,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ? ((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); - pages = min_t(unsigned, pages, BIO_MAX_PAGES); + pages = min(pages, BIO_MAX_VECS); bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); @@ -738,7 +720,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, */ bch2_bio_alloc_pages_pool(c, bio, min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9)); + c->opts.encoded_extent_max)); if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = @@ -935,8 +917,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, size_t dst_len, src_len; if (page_alloc_failed && - bio_sectors(dst) < wp->sectors_free && - bio_sectors(dst) < c->sb.encoded_extent_max) + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; BUG_ON(op->compression_type && @@ -956,7 +938,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (op->csum_type) dst_len = min_t(unsigned, dst_len, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); if (bounce) { swap(dst->bi_iter.bi_size, dst_len); @@ -1289,7 +1271,7 @@ void bch2_write(struct closure *cl) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio_sectors(bio) & (c->opts.block_size - 1)) { + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { bch_err_inum_ratelimited(c, op->pos.inode, "misaligned write"); op->error = -EIO; @@ -2366,8 +2348,8 @@ int bch2_fs_io_init(struct bch_fs *c) mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->opts.btree_node_size, - c->sb.encoded_extent_max) / - PAGE_SECTORS, 0) || + c->opts.encoded_extent_max) / + PAGE_SIZE, 0) || rhashtable_init(&c->promote_table, &bch_promote_params)) return -ENOMEM; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ff8b81fa6772..158df42e5e10 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -642,6 +642,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) int bch2_journal_meta(struct journal *j) { + struct journal_buf *buf; struct journal_res res; int ret; @@ -651,6 +652,10 @@ int bch2_journal_meta(struct journal *j) if (ret) return ret; + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); + bch2_journal_res_put(j, &res); return bch2_journal_flush_seq(j, res.seq); @@ -700,6 +705,44 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = last_unwritten_seq(j); + unwritten_seq < 
seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -770,11 +813,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long b; if (new_fs) { - if (c) - percpu_down_read(&c->mark_lock); b = bch2_bucket_alloc_new_fs(ca); if (b < 0) { - percpu_up_read(&c->mark_lock); ret = -ENOSPC; goto err; } @@ -788,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } - b = sector_to_bucket(ca, ob->ptr.offset); + b = ob->bucket; } if (c) @@ -822,14 +862,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) spin_unlock(&c->journal.lock); - if (new_fs) { - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); - if (c) - percpu_up_read(&c->mark_lock); - } else { + if (!new_fs) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, b, BCH_DATA_journal, @@ -995,10 +1028,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq = last_seq; j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq - 1; + fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); @@ -1011,6 +1048,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, if (seq < last_seq) continue; + if (journal_entry_empty(&i->j)) + j->last_empty_seq = le64_to_cpu(i->j.seq); + p = journal_seq_pin(j, seq); p->devs.nr = 0; @@ -1018,6 +1058,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq; + spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index c580a1dff903..b298873212d2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -278,7 +278,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, @@ -477,6 +477,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 80e0dd311ffd..77201a0ee21d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -710,7 +710,7 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = c->opts.block_size; + sectors = block_sectors(c); goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; @@ -719,7 +719,7 @@ reread: * field of the journal entry we read, so try reading * again at next block boundary: */ - sectors = c->opts.block_size; + sectors = block_sectors(c); break; default: return ret; @@ -1399,9 +1399,10 @@ void 
bch2_journal_write(struct closure *cl) spin_lock(&j->lock); if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - !w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + (w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; @@ -1448,7 +1449,7 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (journal_entry_empty(jset)) + if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) j->last_empty_seq = le64_to_cpu(jset->seq); if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 310c121c5950..ab9a6d966d5e 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -11,7 +11,6 @@ #include <linux/kthread.h> #include <linux/sched/mm.h> -#include <linux/sched/task.h> #include <trace/events/bcachefs.h> /* Free space calculations: */ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 79bc0e49389b..10bd23e969d2 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -235,81 +235,3 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .validate = bch2_sb_journal_seq_blacklist_validate, .to_text = bch2_sb_journal_seq_blacklist_to_text }; - -void bch2_blacklist_entries_gc(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; - struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans trans; - unsigned i, nr, new_nr; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(&trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_STOPPING, &c->flags)) - b = bch2_btree_iter_next_node(&iter); - - if (ret == -EINTR) - goto retry; - - bch2_trans_iter_exit(&trans, &iter); - } - - bch2_trans_exit(&trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); - if (!bl) - goto out; - - nr = blacklist_nr_entries(bl); - dst = bl->start; - - t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - - for (src = bl->start, i = eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty) - *dst++ = *src; - } - - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - new_nr ? 
sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); - - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); - - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e25..b4f876a04586 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,4 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); - #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9b6e402e19f0..f73be9cb7ac3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -427,12 +427,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -784,6 +784,14 @@ out: return ret; } +inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + + scnprintf(stats->name, sizeof(stats->name), + "%s", name); +} + static inline void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) { diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 98323ad93e7c..2a789a1158ca 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -66,13 +66,8 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); -static inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) -{ - memset(stats, 0, sizeof(*stats)); - - scnprintf(stats->name, sizeof(stats->name), - "%s", name); -} +inline void bch_move_stats_init(struct bch_move_stats *stats, + char *name); #endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e81e07a383bb..d9ca69f2ecde 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -66,11 +66,6 @@ const char * const bch2_data_types[] = { NULL }; -const char * const bch2_cache_replacement_policies[] = { - BCH_CACHE_REPLACEMENT_POLICIES() - NULL -}; - const char * const bch2_member_states[] = { BCH_MEMBER_STATES() NULL @@ -141,41 +136,27 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -{ - struct bch_opts opts = bch2_opts_empty(); - -#define x(_name, _bits, _mode, _type, _sb_opt, ...) 
\ - if (_sb_opt != NO_SB_OPT) \ - opt_set(opts, _name, _sb_opt(sb)); - BCH_OPTS() -#undef x - - return opts; -} - const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices +#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = ARRAY_SIZE(_choices),\ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ .to_text = _fn##_to_text -#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ - .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ + .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ }, \ - .mode = _mode, \ + .flags = _flags, \ .hint = _hint, \ .help = _help, \ + .get_sb = _sb_opt, \ .set_sb = SET_##_sb_opt, \ _type \ }, @@ -218,7 +199,41 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, +static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +{ + if (v < opt->min) { + if (msg) + pr_err("invalid %s%s: too small (min %llu)", + msg, opt->attr.name, opt->min); + return -ERANGE; + } + + if (opt->max && v >= opt->max) { + if (msg) + pr_err("invalid %s%s: too big (max %llu)", + msg, opt->attr.name, opt->max); + return -ERANGE; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { + if (msg) + pr_err("invalid %s %s: not a multiple of 512", + msg, opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { + if (msg) + pr_err("invalid %s%s: must be a power of two", + msg, opt->attr.name); + return -EINVAL; + } + + return 0; +} + +int bch2_opt_parse(struct bch_fs *c, const char *msg, + const struct bch_option *opt, const char *val, u64 *res) { ssize_t ret; @@ -228,30 +243,13 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ret = kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res > 1) - return -ERANGE; break; case BCH_OPT_UINT: - ret = kstrtou64(val, 10, res); + ret = opt->flags & OPT_HUMAN_READABLE + ? 
bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; - break; - case BCH_OPT_SECTORS: - ret = bch2_strtou64_h(val, res); - if (ret < 0) - return ret; - - if (*res & 511) - return -EINVAL; - - *res >>= 9; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; break; case BCH_OPT_STR: ret = match_string(opt->choices, -1, val); @@ -264,10 +262,12 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, if (!c) return 0; - return opt->parse(c, val, res); + ret = opt->parse(c, val, res); + if (ret < 0) + return ret; } - return 0; + return bch2_opt_validate(opt, msg, *res); } void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, @@ -288,10 +288,10 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: - pr_buf(out, "%lli", v); - break; - case BCH_OPT_SECTORS: - bch2_hprint(out, v << 9); + if (opt->flags & OPT_HUMAN_READABLE) + bch2_hprint(out, v); + else + pr_buf(out, "%lli", v); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) @@ -365,7 +365,8 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, "mount option ", + &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { @@ -385,7 +386,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, goto no_val; } - if (!(bch2_opt_table[id].mode & OPT_MOUNT)) + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && @@ -420,6 +421,65 @@ out: return ret; } +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) +{ + unsigned id; + int ret; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->get_sb == NO_SB_OPT) + continue; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + ret = bch2_opt_validate(opt, "superblock option ", v); + if (ret) + return ret; + + bch2_opt_set_by_id(opts, id, v); + } + + return 0; +} + +void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = ilog2(v); + + opt->set_sb(sb, v); +} + +void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); + __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* io opts: */ struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 871142778763..661eb5764f68 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -19,7 +19,6 @@ extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; -extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; extern const char * const bch2_d_types[]; @@ -44,19 +43,22 @@ static inline const char *bch2_d_type_str(unsigned d_type) 
LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); /* When can be set: */ -enum opt_mode { +enum opt_flags { OPT_FS = (1 << 0), /* Filesystem option */ OPT_DEVICE = (1 << 1), /* Device option */ OPT_INODE = (1 << 2), /* Inode option */ OPT_FORMAT = (1 << 3), /* May be specified at format time */ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = (1 << 6), + OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, - BCH_OPT_SECTORS, BCH_OPT_STR, BCH_OPT_FN, }; @@ -88,13 +90,15 @@ enum opt_type { #define BCH_OPTS() \ x(block_size, u16, \ - OPT_FS|OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 16), \ BCH_SB_BLOCK_SIZE, 8, \ "size", NULL) \ - x(btree_node_size, u16, \ - OPT_FS|OPT_FORMAT, \ - OPT_SECTORS(1, 512), \ + x(btree_node_size, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 20), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ @@ -122,6 +126,12 @@ enum opt_type { OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ + x(encoded_extent_max, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ + OPT_UINT(4096, 2U << 20), \ + BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ + "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ @@ -198,8 +208,9 @@ enum opt_type { BCH_SB_GC_RESERVE, 8, \ "%", "Percentage of disk space to reserve for copygc")\ x(gc_reserve_bytes, u64, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_SECTORS(0, U64_MAX), \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ + OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(0, U64_MAX), \ BCH_SB_GC_RESERVE_BYTES, 0, \ "%", "Amount of disk space to reserve for copygc\n" \ "Takes precedence over gc_reserve_percent if set")\ @@ -354,12 +365,12 @@ enum opt_type { NULL, NULL) \ x(fs_size, u64, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ @@ -418,13 +429,14 @@ struct printbuf; struct bch_option { struct attribute attr; + u64 (*get_sb)(const struct bch_sb *); void (*set_sb)(struct bch_sb *, u64); - enum opt_mode mode; enum opt_type type; + enum opt_flags flags; + u64 min, max; union { struct { - u64 min, max; }; struct { const char * const *choices; @@ -446,10 +458,13 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -struct bch_opts bch2_opts_from_sb(struct bch_sb *); +int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); +void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const struct 
bch_option *, const char *, u64 *); +int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, + const char *, u64 *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 29fe6260ace5..8b0e468f45a0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -115,21 +115,12 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, struct journal_key n = { .btree_id = id, .level = level, - .k = k, .allocated = true }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; unsigned idx = journal_key_search(keys, id, level, k->k.p); - if (idx < keys->nr && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; - return 0; - } - if (keys->nr == keys->size) { struct journal_keys new_keys = { .nr = keys->nr, @@ -149,10 +140,23 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, *keys = new_keys; } - array_insert_item(keys->d, keys->nr, idx, n); + n.k = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n.k) + return -ENOMEM; + + bkey_copy(n.k, k); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + } else { + array_insert_item(keys->d, keys->nr, idx, n); - list_for_each_entry(iter, &c->journal_iters, list) - journal_iter_fix(c, iter, idx); + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); + } return 0; } @@ -160,22 +164,12 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, unsigned level, struct bpos pos) { - struct bkey_i *whiteout = - kmalloc(sizeof(struct bkey), GFP_KERNEL); - int ret; - - if (!whiteout) { - bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; - } + struct bkey_i whiteout; - bkey_init(&whiteout->k); - whiteout->k.p = pos; + bkey_init(&whiteout.k); + whiteout.k.p = pos; - ret = bch2_journal_key_insert(c, id, level, whiteout); - if (ret) - kfree(whiteout); - return ret; + return bch2_journal_key_insert(c, id, level, &whiteout); } static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) @@ -1149,16 +1143,6 @@ use_clean: if (ret) goto err; - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that - * happened before their corresponding journal writes - those btree - * writes need to be ignored, by skipping and blacklisting the next few - * journal sequence numbers: - */ - if (!c->sb.clean) - journal_seq += 8; - if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); @@ -1295,24 +1279,15 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - struct bch_move_stats stats; - - bch_move_stats_init(&stats, "recovery"); - - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - mutex_lock(&c->sb_lock); + /* + * With journal replay done, we can clear the journal seq blacklist + * table: + */ + BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + if (le16_to_cpu(c->sb.version_min) >= 
bcachefs_metadata_version_btree_ptr_sectors_written) + bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); + if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1336,9 +1311,23 @@ use_clean: bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } ret = 0; out: @@ -1383,9 +1372,6 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } - - for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); @@ -1429,6 +1415,8 @@ int bch2_fs_initialize(struct bch_fs *c) percpu_ref_put(&ca->ref); goto err; } + + ca->new_fs_bucket_idx = 0; } err = "error creating root snapshot node"; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 5de733b95aa4..57d636740d2f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -13,7 +13,7 @@ #include <linux/crc32c.h> #include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 88a8e54fbd7a..b8d2cf66a630 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -261,8 +261,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) block_size = le16_to_cpu(sb->block_size); - if (!is_power_of_2(block_size) || - block_size > PAGE_SECTORS) + if (block_size > PAGE_SECTORS) return "Bad block size"; if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) @@ -304,9 +303,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (!BCH_SB_BTREE_NODE_SIZE(sb)) return "Btree node size not set"; - if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) - return "Btree node size not a power of two"; - if (BCH_SB_GC_RESERVE(sb) < 5) return "gc reserve percentage too small"; @@ -366,7 +362,6 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; @@ -621,8 +616,12 @@ got_super: err = "Superblock block size smaller than device block size"; ret = -EINVAL; if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) - goto err; + bdev_logical_block_size(sb->bdev)) { + pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); + goto err_no_print; + } ret = 0; 
sb->have_layout = true; @@ -630,8 +629,9 @@ out: pr_verbose_init(*opts, "ret %i", ret); return ret; err: - bch2_free_super(sb); pr_err("error reading superblock: %s", err); +err_no_print: + bch2_free_super(sb); goto out; } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b64ac2fbbf8b..5c264875acb4 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -110,7 +110,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), .state = BCH_MEMBER_STATE(mi), - .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), .durability = BCH_MEMBER_DURABILITY(mi) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5a3bb543b99d..1dbbf5231567 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -166,44 +166,6 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) &c->dev_usage_journal_res, u64s * nr); } -int bch2_congested(void *data, int bdi_bits) -{ - struct bch_fs *c = data; - struct backing_dev_info *bdi; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - rcu_read_lock(); - if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - const struct bch_devs_mask *devs = - bch2_target_to_mask(c, c->opts.foreground_target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_member_device_rcu(ca, c, i, devs) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } - rcu_read_unlock(); - - return ret; -} - /* Filesystem RO/RW: */ /* @@ -566,8 +528,6 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_STOPPING, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -575,8 +535,7 @@ void __bch2_fs_stop(struct bch_fs *c) for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -731,9 +690,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); @@ -793,10 +749,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); c->opts = bch2_opts_default; - bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); + ret = bch2_opts_from_sb(&c->opts, sb); + if (ret) + goto err; + bch2_opts_apply(&c->opts, opts); - c->block_bits = ilog2(c->opts.block_size); + c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); if (bch2_fs_init_fault("fs_alloc")) { @@ -908,7 +867,7 @@ static void print_mount_opts(struct bch_fs *c) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -1034,7 +993,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) if (!sb_mi) return 
"Invalid superblock: member info area missing"; - if (le16_to_cpu(sb->block_size) != c->opts.block_size) + if (le16_to_cpu(sb->block_size) != block_sectors(c)) return "mismatched block size"; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < @@ -1079,8 +1038,7 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1116,10 +1074,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) wait_for_completion(&ca->io_ref_completion); if (ca->kobj.state_in_sysfs) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - - sysfs_remove_link(block, "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); sysfs_remove_link(&ca->kobj, "block"); } @@ -1156,12 +1111,12 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) } if (ca->disk_sb.bdev) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); if (ret) return ret; + ret = sysfs_create_link(&ca->kobj, block, "block"); if (ret) return ret; @@ -1640,24 +1595,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; - struct bucket_array *buckets; - struct bucket *g; unsigned dev_idx, nr_devices, u64s; int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); return ret; + } err = bch2_sb_validate(&sb); - if (err) + if (err) { + bch_err(c, "device add error: error validating super: %s", err); return -EINVAL; + } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; err = bch2_dev_may_add(sb.sb, c); - if (err) + if (err) { + bch_err(c, "device add error: %s", err); return -EINVAL; + } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { @@ -1671,38 +1630,27 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } - /* - * We want to allocate journal on the new device before adding the new - * device to the filesystem because allocating after we attach requires - * spinning up the allocator thread, and the allocator thread requires - * doing btree writes, which if the existing devices are RO isn't going - * to work - * - * So we have to mark where the superblocks are, but marking allocated - * data normally updates the filesystem usage too, so we have to mark, - * allocate the journal, reset all the marks, then remark after we - * attach... 
- */ - bch2_mark_dev_superblock(NULL, ca, 0); - - err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); - if (ret) + if (ret) { + bch_err(c, "device add error: journal alloc failed"); goto err; + } down_write(&c->state_lock); mutex_lock(&c->sb_lock); - err = "insufficient space in new superblock"; ret = bch2_sb_from_fs(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: new device superblock too small"); goto err_unlock; + } mi = bch2_sb_get_members(ca->disk_sb.sb); if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { + bch_err(c, "device add error: new device superblock too small"); ret = -ENOSPC; goto err_unlock; } @@ -1715,7 +1663,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - err = "no slots available in superblock"; + bch_err(c, "device add error: already have maximum number of devices"); ret = -ENOSPC; goto err_unlock; @@ -1724,12 +1672,12 @@ have_slot: u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - ret = -ENOSPC; - mi = bch2_sb_resize_members(&c->disk_sb, u64s); - if (!mi) + if (!mi) { + bch_err(c, "device add error: no room in superblock for member info"); + ret = -ENOSPC; goto err_unlock; + } /* success: */ @@ -1745,25 +1693,20 @@ have_slot: bch2_dev_usage_journal_reserve(c); - /* - * Clear marks before marking transactionally in the btree, so that - * per-device accounting gets done correctly: - */ - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for_each_bucket(g, buckets) - atomic64_set(&g->_mark.v, 0); - up_read(&ca->bucket_lock); - - err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: error marking new superblock: %i", ret); goto err_late; + } + + ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) { ret = __bch2_dev_read_write(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: error going RW on new device: %i", ret); goto err_late; + } } up_write(&c->state_lock); @@ -1776,11 +1719,9 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - bch_err(c, "Unable to add device: %s", err); return ret; err_late: up_write(&c->state_lock); - bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } @@ -1917,20 +1858,23 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) { - struct block_device *bdev = lookup_bdev(path); struct bch_dev *ca; + dev_t dev; unsigned i; + int ret; - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); - for_each_member_device(ca, c, i) - if (ca->disk_sb.bdev == bdev) + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev->bd_dev == dev) goto found; - ca = ERR_PTR(-ENOENT); found: - bdput(bdev); + rcu_read_unlock(); + return ca; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 5cee064995af..c3273e9c711d 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -194,9 +194,29 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) return devs; } +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) 
+ return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 96023f37afea..d8b159a5b7f7 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -29,7 +29,6 @@ struct bch_member_cpu { u16 bucket_size; /* sectors */ u16 group; u8 state; - u8 replacement; u8 discard; u8 data_allowed; u8 durability; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3f51eda749f0..6d1596322ee2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -10,6 +10,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "alloc_foreground.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_io.h" @@ -131,7 +132,6 @@ do { \ return strtoi_h(buf, &var) ?: (ssize_t) size; \ } while (0) -write_attribute(trigger_journal_flush); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -177,7 +177,6 @@ read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); rw_attribute(discard); -rw_attribute(cache_replacement_policy); rw_attribute(label); rw_attribute(copy_gc_enabled); @@ -267,8 +266,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + enum btree_id id; + u64 nr_uncompressed_extents = 0, nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; int ret; @@ -278,47 +281,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) - if (k.k->type == KEY_TYPE_extent) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + for (id = 0; id < BTREE_ID_NR; id++) { + if (!((1U << id) & BTREE_ID_HAS_PTRS)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) { - if (!crc_is_compressed(p.crc)) { - nr_uncompressed_extents++; - uncompressed_sectors += e.k->size; - } else { - nr_compressed_extents++; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + incompressible_sectors += k.k->size; + break; + default: compressed_sectors_compressed += p.crc.compressed_size; compressed_sectors_uncompressed += p.crc.uncompressed_size; + compressed = true; + break; } - - /* only looking at the first ptr */ - break; } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; 
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(&trans, &iter);
+ }
bch2_trans_exit(&trans);
+
if (ret)
return ret;
- pr_buf(out,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
+ pr_buf(out, "uncompressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, uncompressed_sectors << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "compressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
+ pr_buf(out, " compressed size: ");
+ bch2_hprint(out, compressed_sectors_compressed << 9);
+ pr_buf(out, "\n");
+ pr_buf(out, " uncompressed size: ");
+ bch2_hprint(out, compressed_sectors_uncompressed << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "incompressible:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, incompressible_sectors << 9);
+ pr_buf(out, "\n");
return 0;
}
@@ -483,9 +511,6 @@ STORE(bch2_fs)
/* Debugging: */
- if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta(&c->journal);
-
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
@@ -575,7 +600,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_io_timers_read,
&sysfs_io_timers_write,
- &sysfs_trigger_journal_flush,
&sysfs_trigger_gc,
&sysfs_prune_cache,
@@ -626,7 +650,7 @@ STORE(bch2_fs_opts_dir)
if (!tmp)
return -ENOMEM;
- ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
kfree(tmp);
if (ret < 0)
@@ -636,13 +660,7 @@ STORE(bch2_fs_opts_dir)
if (ret < 0)
return ret;
- if (opt->set_sb != SET_NO_SB_OPT) {
- mutex_lock(&c->sb_lock);
- opt->set_sb(c->disk_sb.sb, v);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
+ bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
@@ -665,7 +683,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
for (i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (!(i->mode & OPT_FS))
+ if (!(i->flags & OPT_FS))
continue;
ret = sysfs_create_file(kobj, &i->attr);
@@ -735,7 +753,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
memset(nr, 0, sizeof(nr));
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].type]++;
+ nr[c->open_buckets[i].data_type]++;
pr_buf(out,
"\t\t buckets\t sectors fragmented\n"
@@ -832,14 +850,6 @@ SHOW(bch2_dev)
return out.pos - buf;
}
- if (attr == &sysfs_cache_replacement_policy) {
- bch2_string_opt_to_text(&out,
- bch2_cache_replacement_policies,
- ca->mi.replacement);
- pr_buf(&out, "\n");
- return out.pos - buf;
- }
-
if (attr == &sysfs_state_rw) {
bch2_string_opt_to_text(&out, bch2_member_states,
ca->mi.state);
@@ -899,22 +909,6 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
- if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
-
- if (v < 0)
- return v;
-
- mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
- SET_BCH_MEMBER_REPLACEMENT(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -945,7 +939,6 @@ struct attribute *bch2_dev_files[] = {
/* settings: */
&sysfs_discard,
- &sysfs_cache_replacement_policy,
&sysfs_state_rw,
&sysfs_label,
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 478c00a5c975..60ccb94e5de5 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -618,7 +618,6 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
static int __do_delete(struct btree_trans *trans, struct bpos pos)
{
struct btree_iter iter;
- struct bkey_i delete;
struct bkey_s_c k;
int ret = 0;
@@ -632,10 +631,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
if (!k.k)
goto err;
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
-
- ret = bch2_trans_update(trans, &iter, &delete, 0);
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -650,7 +646,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
- struct bpos pos = POS(0, test_rand());
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 52de7c49cacb..0bbea332fcaa 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -114,7 +114,7 @@ void bch2_hprint(struct printbuf *buf, s64 v)
* 103 is magic: t is in the range [-1023, 1023] and we want
* to turn it into [-9, 9]
*/
- if (u && v < 100 && v > -100)
+ if (u && t && v < 100 && v > -100)
pr_buf(buf, ".%i", t / 103);
if (u)
pr_buf(buf, "%c", si_units[u]);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 61c28595585b..80402b398442 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -18,9 +18,6 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTORS_SHIFT)
-
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -88,7 +85,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -653,35 +650,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 97ed56d87ac5..4d7db64e3ef3 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -359,6 +359,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
}
static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -491,6 +492,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode,
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -523,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, opt, buf, &v);
+ ret = bch2_opt_parse(c, NULL, opt, buf, &v);
kfree(buf);
if (ret < 0)
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index e314877ef174..5a409ee19d93 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -71,10 +71,10 @@ DECLARE_EVENT_CLASS(bio,
),
TP_fast_assign(
- __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u",
@@ -318,6 +318,34 @@ DEFINE_EVENT(btree_node, btree_set_root,
TP_ARGS(c, b)
);
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(unsigned long nr_to_scan_pages,
+ unsigned long nr_to_scan_nodes,
+ unsigned long can_free_nodes,
+ long ret),
+ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_to_scan_pages )
+ __field(unsigned long, nr_to_scan_nodes )
+ __field(unsigned long, can_free_nodes )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan_pages = nr_to_scan_pages;
+ __entry->nr_to_scan_nodes = nr_to_scan_nodes;
+ __entry->can_free_nodes = can_free_nodes;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
+ __entry->nr_to_scan_pages,
+ __entry->nr_to_scan_nodes,
+ __entry->can_free_nodes,
+ __entry->ret)
+);
+
/* Garbage collection */
DEFINE_EVENT(btree_node, btree_gc_rewrite_node,