diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2020-09-07 14:13:00 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2020-09-07 14:13:00 -0400 |
commit | 4e21b048c317fac4ca43eb7cdcf8918f84dec12a (patch) | |
tree | b6c13ff97bb697d0c795543096cc87fee92b4c17 | |
parent | f54540c1c0413ebb280b68c4aa2d68ab8ba6b70e (diff) |
Merge with fb2821e726 — bcachefs: Don't fail mount if device has been removed
32 files changed, 274 insertions, 423 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 43b9f99194b9..9aa0b42b26b6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -350,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); while (1) { + bch2_trans_cond_resched(&trans); + ret = bch2_alloc_write_key(&trans, iter, flags); if (ret < 0 || ret == ALLOC_END) break; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 06bb267e94f1..3a5a00e53cbf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -202,7 +202,8 @@ #include "opts.h" #include "util.h" -#include <linux/dynamic_fault.h> +#define dynamic_fault(...) 0 +#define race_fault(...) 0 #define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) @@ -734,7 +735,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0d570f3adf0..736671112861 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -73,10 +73,6 @@ static const struct rhashtable_params bch_btree_cache_params = { .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); @@ -85,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7c3fb5fb0cca..2f5097218f9c 100644 
--- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -597,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_iter_reinit_node(iter, b); } -static struct nonce btree_nonce(struct bset *i, unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -{ - struct nonce nonce = btree_nonce(i, offset); - - if (!offset) { - struct btree_node *bn = container_of(i, struct btree_node, keys); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, - bytes); - - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); - } - - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 66ebdd39f5b3..626d0f071b70 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -5,6 +5,7 @@ #include "bkey_methods.h" #include "bset.h" #include "btree_locking.h" +#include "checksum.h" #include "extents.h" #include "io_types.h" @@ -82,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * return false; } +static inline struct nonce btree_nonce(struct bset *i, unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + 
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, + bytes); + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_build_aux_trees(struct btree *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d73cc8ddadac..61662750dfc0 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -391,7 +391,7 @@ static void btree_key_cache_journal_flush(struct journal *j, struct btree_trans trans; six_lock_read(&ck->c.lock, NULL, NULL); - key = READ_ONCE(ck->key); + key = ck->key; if (ck->journal.seq != seq || !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 4ebe80b05ffc..d5215b14d7d9 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -125,6 +125,7 @@ struct disk_reservation { struct copygc_heap_entry { u8 dev; u8 gen; + u16 fragmentation; u32 sectors; u64 offset; }; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - 
skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -463,7 +464,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - 
c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 833537cc8fd0..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 47838fd2db06..b50d2b0d5fd3 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -45,7 +45,7 @@ static bool 
bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 4a4ec8f46108..c52b6faac9b4 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - return t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; } @@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) rcu_read_lock(); g = rcu_dereference(c->disk_groups); - m = t.group < g->nr && !g->entries[t.group].deleted + m = g && t.group < g->nr && !g->entries[t.group].deleted ? 
&g->entries[t.group].devs : NULL; @@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; int v = -1; + int ret = 0; mutex_lock(&c->sb_lock); @@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) return v; } + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + goto unlock; write_sb: mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return ret; } int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index c8e0c37a5e1a..3d84f23c34ed 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); + +/* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, unsigned); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 425b0b806cee..5514f65378ad 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1594,7 +1594,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(h, &c->ec_stripe_new_list, list) { + list_for_each_entry(s, &c->ec_stripe_new_list, list) { pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", s->blocks.nr, bitmap_weight(s->blocks_allocated, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2d08263f3a42..55004998536d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -603,7 +603,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page 
*newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -628,10 +628,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -783,11 +783,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1038,32 +1035,33 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; 
j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1087,7 +1085,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1241,7 +1239,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_PAGES * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1810,8 +1808,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned; + unsigned unaligned; bool sync = dio->sync; long ret; @@ -1820,7 +1819,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) while (1) { if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -1828,7 +1827,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); if (unlikely(ret < 0)) goto err; @@ -1842,7 +1841,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1905,7 +1904,7 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, 
bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; @@ -2816,235 +2815,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. 
*/ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? 
-EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. 
- */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3240,7 +3010,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..7063556d289b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 031e6d931171..0873d2f0928c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ 
b/fs/bcachefs/fs-ioctl.c @@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_projid >= U32_MAX) return -EINVAL; + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ s.projid = fa.fsx_projid + 1; ret = mnt_want_write_file(file); @@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, s.projid); + ret = bch2_set_projid(c, inode, fa.fsx_projid); if (ret) goto err_unlock; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ba73e5258e8d..e504e6b19abe 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -25,6 +25,7 @@ #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/exportfs.h> +#include <linux/fiemap.h> #include <linux/module.h> #include <linux/posix_acl.h> #include <linux/random.h> @@ -860,6 +861,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bool have_extent = false; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; @@ -966,15 +971,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -992,7 +988,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1245,8 +1241,8 @@ static int bch2_statfs(struct dentry 
*dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = usage.nr_inodes; - buf->f_ffree = U64_MAX; + buf->f_files = 0; + buf->f_ffree = 0; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); @@ -1410,6 +1406,24 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) return ret; } +static int bch2_show_devname(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + struct bch_dev *ca; + unsigned i; + bool first = true; + + for_each_online_member(ca, c, i) { + if (!first) + seq_putc(seq, ':'); + first = false; + seq_puts(seq, "/dev/"); + seq_puts(seq, ca->name); + } + + return 0; +} + static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -1433,7 +1447,6 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) } return 0; - } static const struct super_operations bch_super_operations = { @@ -1443,6 +1456,7 @@ static const struct super_operations bch_super_operations = { .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, + .show_devname = bch2_show_devname, .show_options = bch2_show_options, .remount_fs = bch2_remount, #if 0 @@ -1523,7 +1537,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c6ca5968a2e0..5a6df3d1973a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "deleting inode %llu", 
u.bi_inum); + bch2_fs_lazy_rw(c); + ret = bch2_inode_rm(c, u.bi_inum); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); @@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); + bch2_fs_lazy_rw(c); + /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4fad37fdee25..5c9c3cf54edd 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -54,7 +54,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) return false; rcu_read_lock(); - devs = bch2_target_to_mask(c, target); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ca = rcu_dereference(c->devs[d]); if (!ca) @@ -132,10 +134,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -471,7 +473,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->have_ioref = bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? 
READ : WRITE); n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; @@ -1091,6 +1094,11 @@ again: goto err; } + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, @@ -1100,7 +1108,8 @@ again: op->nr_replicas_required, op->alloc_reserve, op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); EBUG_ON(!wp); if (unlikely(IS_ERR(wp))) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1dde0b5d963f..56438840efd7 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -281,7 +281,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 89585833c846..bd0e6b371701 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -29,9 +29,11 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_list *jlist, struct jset *j) + struct journal_list *jlist, struct jset *j, + bool bad) { struct journal_replay *i, *pos; + struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); __le64 last_seq; @@ -60,8 +62,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = where->next != jlist->head + ? 
container_of(where->next, struct journal_replay, list) + : NULL; + + /* + * Duplicate journal entries? If so we want the one that didn't have a + * checksum error: + */ + if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (i->bad) { + devs = i->devs; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } else if (bad) { + goto found; + } else { fsck_err_on(bytes != vstruct_bytes(&i->j) || memcmp(j, &i->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", @@ -69,14 +94,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, goto found; } - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } } - where = jlist->head; -add: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) { ret = -ENOMEM; @@ -84,7 +103,8 @@ add: } list_add(&i->list, where); - i->devs.nr = 0; + i->devs = devs; + i->bad = bad; memcpy(&i->j, j, bytes); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -391,6 +411,7 @@ fsck_err: } static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, struct jset *jset, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read, @@ -405,16 +426,19 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if ((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max) { - bch_err(c, "unknown journal entry version %u", jset->version); - return BCH_FSCK_UNKNOWN_VERSION; + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, c, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca->name, sector, le64_to_cpu(jset->seq), + version)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; } if (journal_entry_err_on(bytes > 
bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca->name, sector, le64_to_cpu(jset->seq), bytes)) { /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; } @@ -423,13 +447,15 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(jset), sector)) + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca->name, sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) return JOURNAL_ENTRY_BAD; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "journal checksum bad, sector %llu", sector)) { + "%s sector %llu seq %llu: journal checksum bad", + ca->name, sector, le64_to_cpu(jset->seq))) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; @@ -440,8 +466,10 @@ static int jset_validate(struct bch_fs *c, vstruct_end(jset) - (void *) jset->encrypted_start); if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) + "invalid journal entry: last_seq > seq")) { jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } return 0; fsck_err: @@ -516,11 +544,12 @@ reread: j = buf->data; } - ret = jset_validate(c, j, offset, + ret = jset_validate(c, ca, j, offset, end - offset, sectors_read, READ); switch (ret) { case BCH_FSCK_OK: + sectors = vstruct_sectors(j, c->block_bits); break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { @@ -537,8 +566,13 @@ reread: goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; + /* + * On checksum error we don't really trust the size + * field of the journal entry 
we read, so try reading + * again at next block boundary: + */ sectors = c->opts.block_size; - goto next_block; + break; default: return ret; } @@ -555,7 +589,7 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j); + ret = journal_entry_add(c, ca, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -566,8 +600,6 @@ reread: default: return ret; } - - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); offset += sectors; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 72e575f360af..6958ee0f8cf2 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,8 @@ struct journal_replay { struct list_head list; struct bch_devs_list devs; + /* checksum error, but we may want to try using it anyways: */ + bool bad; /* must be last: */ struct jset j; }; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index a21de0088753..d0f1bbf8f6a7 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -36,15 +36,6 @@ * that bset, until that btree node is rewritten. */ -static unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - static unsigned sb_blacklist_u64s(unsigned nr) { struct bch_sb_field_journal_seq_blacklist *bl; diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index 03f4b97247fd..afb886ec8e25 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -2,6 +2,15 @@ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +static inline unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +{ + return bl + ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4a2c4debd3f0..2f3be487ef65 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -320,12 +320,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 55aa463f992f..de0a7974ec9f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -44,13 +44,6 @@ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) -static inline int sectors_used_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) -{ - return cmp_int(l.sectors, r.sectors); -} - static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; @@ -123,6 +116,13 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) +{ + return cmp_int(l.fragmentation, r.fragmentation); +} + static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; @@ -180,10 +180,12 @@ static int bch2_copygc(struct bch_fs *c) e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, + .fragmentation = bucket_sectors_used(m) * (1U << 15) + / ca->mi.bucket_size, .sectors = 
bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; - heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); } up_read(&ca->bucket_lock); } @@ -197,7 +199,7 @@ static int bch2_copygc(struct bch_fs *c) sectors_to_move += i->sectors; while (sectors_to_move > sectors_reserved) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); sectors_to_move -= e.sectors; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d6a832a38b20..014c608ca0c6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -83,7 +83,7 @@ enum opt_type { "size", NULL) \ x(btree_node_size, u16, \ OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_SECTORS(1, 512), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 28972f30e198..6e829bf0a31f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1039,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c) } journal_seq += 4; + + /* + * The superblock needs to be written before we do any btree + * node writes: it will be in the read_write() path + */ } ret = bch2_blacklist_table_initialize(c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1d9a6bfa8c13..30be083b09bf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -169,10 +169,9 @@ int bch2_congested(void *data, int bdi_bits) } } } else { - unsigned target = READ_ONCE(c->opts.foreground_target); - const struct bch_devs_mask *devs = target - ? 
bch2_target_to_mask(c, target) - : &c->rw_devs[BCH_DATA_user]; + const struct bch_devs_mask *devs = + bch2_target_to_mask(c, c->opts.foreground_target) ?: + &c->rw_devs[BCH_DATA_user]; for_each_member_device_rcu(ca, c, i, devs) { bdi = ca->disk_sb.bdev->bd_bdi; @@ -384,8 +383,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch2_fs_read_only_async(c); bch2_journal_halt(&c->journal); + bch2_fs_read_only_async(c); wake_up(&bch_read_only_wait); return ret; @@ -442,6 +441,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; + /* + * We need to write out a journal entry before we start doing btree + * updates, to ensure that on unclean shutdown new journal blacklist + * entries are created: + */ + bch2_journal_meta(&c->journal); + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) @@ -1820,7 +1826,6 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) { - struct block_device *bdev = lookup_bdev(path); struct bch_dev *ca; unsigned i; @@ -1845,6 +1850,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, { struct bch_sb_handle *sb = NULL; struct bch_fs *c = NULL; + struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; int ret = -ENOMEM; @@ -1880,10 +1886,24 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, le64_to_cpu(sb[best_sb].sb->seq)) best_sb = i; - for (i = 0; i < nr_devices; i++) { + mi = bch2_sb_get_members(sb[best_sb].sb); + + i = 0; + while (i < nr_devices) { + if (i != best_sb && + !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { + char buf[BDEVNAME_SIZE]; + pr_info("%s has been removed, skipping", + bdevname(sb[i].bdev, buf)); + bch2_free_super(&sb[i]); + array_remove_item(sb, nr_devices, i); + continue; + } + err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); if (err) goto err_print; + i++; } ret = 
-ENOMEM; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 4aa5dd7917cf..fffee96726ce 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -222,6 +222,15 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +/* + * Only for use in the recovery/fsck path: + */ +static inline void bch2_fs_lazy_rw(struct bch_fs *c) +{ + if (percpu_ref_is_zero(&c->writes)) + bch2_fs_read_write_early(c); +} + void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 119c86122023..f48c6380684f 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -99,7 +99,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, get_order(size)) ?: - __vmalloc(size, gfp_mask, PAGE_KERNEL); + __vmalloc(size, gfp_mask); } static inline void kvpfree(void *p, size_t size) @@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), 
void (*swap_func)(void *, void *, size_t)); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 725a6f3ef8ce..21f64cb7e402 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -511,7 +511,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, mutex_lock(&inode->ei_update_lock); if (inode_opt_id == Inode_opt_project) { - ret = bch2_set_projid(c, inode, s.v); + /* + * inode fields accessible via the xattr interface are stored + * with a +1 bias, so that 0 means unset: + */ + ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); if (ret) goto err; } |