author		Kent Overstreet <kent.overstreet@gmail.com>	2020-09-07 14:13:00 -0400
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-09-07 14:13:00 -0400
commit		4e21b048c317fac4ca43eb7cdcf8918f84dec12a (patch)
tree		b6c13ff97bb697d0c795543096cc87fee92b4c17
parent		f54540c1c0413ebb280b68c4aa2d68ab8ba6b70e (diff)

Merge with fb2821e726 bcachefs: Don't fail mount if device has been removed
-rw-r--r--  fs/bcachefs/alloc_background.c      |   2
-rw-r--r--  fs/bcachefs/bcachefs.h              |   5
-rw-r--r--  fs/bcachefs/btree_cache.c           |   7
-rw-r--r--  fs/bcachefs/btree_io.c              |  28
-rw-r--r--  fs/bcachefs/btree_io.h              |  29
-rw-r--r--  fs/bcachefs/btree_key_cache.c       |   2
-rw-r--r--  fs/bcachefs/buckets_types.h         |   1
-rw-r--r--  fs/bcachefs/checksum.c              |  31
-rw-r--r--  fs/bcachefs/checksum.h              |   6
-rw-r--r--  fs/bcachefs/compress.c              |   2
-rw-r--r--  fs/bcachefs/disk_groups.c           |  11
-rw-r--r--  fs/bcachefs/disk_groups.h           |   3
-rw-r--r--  fs/bcachefs/ec.c                    |   2
-rw-r--r--  fs/bcachefs/fs-io.c                 | 276
-rw-r--r--  fs/bcachefs/fs-io.h                 |   4
-rw-r--r--  fs/bcachefs/fs-ioctl.c              |   6
-rw-r--r--  fs/bcachefs/fs.c                    |  42
-rw-r--r--  fs/bcachefs/fsck.c                  |   4
-rw-r--r--  fs/bcachefs/io.c                    |  19
-rw-r--r--  fs/bcachefs/journal.h               |   2
-rw-r--r--  fs/bcachefs/journal_io.c            |  84
-rw-r--r--  fs/bcachefs/journal_io.h            |   2
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c |   9
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h |   9
-rw-r--r--  fs/bcachefs/move.c                  |   4
-rw-r--r--  fs/bcachefs/movinggc.c              |  20
-rw-r--r--  fs/bcachefs/opts.h                  |   2
-rw-r--r--  fs/bcachefs/recovery.c              |   5
-rw-r--r--  fs/bcachefs/super.c                 |  34
-rw-r--r--  fs/bcachefs/super.h                 |   9
-rw-r--r--  fs/bcachefs/util.h                  |  31
-rw-r--r--  fs/bcachefs/xattr.c                 |   6
32 files changed, 274 insertions(+), 423 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 43b9f99194b9..9aa0b42b26b6 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -350,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
while (1) {
+ bch2_trans_cond_resched(&trans);
+
ret = bch2_alloc_write_key(&trans, iter, flags);
if (ret < 0 || ret == ALLOC_END)
break;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 06bb267e94f1..3a5a00e53cbf 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -202,7 +202,8 @@
#include "opts.h"
#include "util.h"
-#include <linux/dynamic_fault.h>
+#define dynamic_fault(...) 0
+#define race_fault(...) 0
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
@@ -734,7 +735,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index a0d570f3adf0..736671112861 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -73,10 +73,6 @@ static const struct rhashtable_params bch_btree_cache_params = {
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
@@ -85,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 7c3fb5fb0cca..2f5097218f9c 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -597,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bch2_btree_iter_reinit_node(iter, b);
}
-static struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
- struct nonce nonce = btree_nonce(i, offset);
-
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
-
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
- }
-
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct btree *b, struct bset *i,
unsigned offset, int write)
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 66ebdd39f5b3..626d0f071b70 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -5,6 +5,7 @@
#include "bkey_methods.h"
#include "bset.h"
#include "btree_locking.h"
+#include "checksum.h"
#include "extents.h"
#include "io_types.h"
@@ -82,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *
return false;
}
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
+
+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+ struct nonce nonce = btree_nonce(i, offset);
+
+ if (!offset) {
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+ bytes);
+
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+ }
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
+
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
void bch2_btree_build_aux_trees(struct btree *);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index d73cc8ddadac..61662750dfc0 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -391,7 +391,7 @@ static void btree_key_cache_journal_flush(struct journal *j,
struct btree_trans trans;
six_lock_read(&ck->c.lock, NULL, NULL);
- key = READ_ONCE(ck->key);
+ key = ck->key;
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 4ebe80b05ffc..d5215b14d7d9 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -125,6 +125,7 @@ struct disk_reservation {
struct copygc_heap_entry {
u8 dev;
u8 gen;
+ u16 fragmentation;
u32 sectors;
u64 offset;
};
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
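
For reference, a minimal standalone sketch of the nonce_add() arithmetic above (not part of the diff; struct nonce and the __le32 handling are simplified to plain uint32_t, assuming a little-endian host). ChaCha20 is seekable in 64-byte blocks, so encrypting or decrypting at a byte offset only requires bumping the 32-bit block counter in the nonce by offset / CHACHA_BLOCK_SIZE:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

struct nonce { uint32_t d[4]; };	/* simplified: __le32 -> uint32_t */

static struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
	/* offsets must be a whole number of ChaCha blocks */
	assert(!(offset & (CHACHA_BLOCK_SIZE - 1)));
	nonce.d[0] += offset / CHACHA_BLOCK_SIZE;
	return nonce;
}

int main(void)
{
	struct nonce n = {{ 0, 1, 2, 3 }};

	n = nonce_add(n, 2 * CHACHA_BLOCK_SIZE);
	printf("block counter = %u\n", n.d[0]);	/* prints 2 */
	return 0;
}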
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 47838fd2db06..b50d2b0d5fd3 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 4a4ec8f46108..c52b6faac9b4 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- return t.group < g->nr && !g->entries[t.group].deleted
+ return g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
}
@@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
- m = t.group < g->nr && !g->entries[t.group].deleted
+ m = g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
@@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
+ int ret = 0;
mutex_lock(&c->sb_lock);
@@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
return v;
}
+ ret = bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ goto unlock;
write_sb:
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return ret;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index c8e0c37a5e1a..3d84f23c34ed 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+
void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
unsigned);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 425b0b806cee..5514f65378ad 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1594,7 +1594,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(h, &c->ec_stripe_new_list, list) {
+ list_for_each_entry(s, &c->ec_stripe_new_list, list) {
pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
s->blocks.nr,
bitmap_weight(s->blocks_allocated,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 2d08263f3a42..55004998536d 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -603,7 +603,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -628,10 +628,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -783,11 +783,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1038,32 +1035,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1087,7 +1085,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1241,7 +1239,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1810,8 +1808,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
bool sync = dio->sync;
long ret;
@@ -1820,7 +1819,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
while (1) {
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -1828,7 +1827,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
if (unlikely(ret < 0))
goto err;
@@ -1842,7 +1841,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1905,7 +1904,7 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -2816,235 +2815,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3240,7 +3010,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 031e6d931171..0873d2f0928c 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (fa.fsx_projid >= U32_MAX)
return -EINVAL;
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
s.projid = fa.fsx_projid + 1;
ret = mnt_want_write_file(file);
@@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_set_projid(c, inode, s.projid);
+ ret = bch2_set_projid(c, inode, fa.fsx_projid);
if (ret)
goto err_unlock;
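
A minimal userspace sketch of the +1 bias described in the new comment (illustrative only, not the kernel implementation): inode fields exposed through the xattr/ioctl interface store value + 1 so that 0 can mean "unset", which is why the fix above passes the unbiased fa.fsx_projid to bch2_set_projid while the biased s.projid is what gets stored:

#include <stdint.h>
#include <stdio.h>

/* stored field: 0 means unset, otherwise projid + 1 */
static uint32_t projid_to_field(uint32_t projid)
{
	return projid + 1;
}

static uint32_t field_to_projid(uint32_t field)
{
	return field ? field - 1 : 0;
}

int main(void)
{
	uint32_t stored = projid_to_field(42);

	printf("stored=%u projid=%u\n", stored, field_to_projid(stored));
	/* prints stored=43 projid=42; a stored 0 decodes as projid 0/unset */
	return 0;
}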
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ba73e5258e8d..e504e6b19abe 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -25,6 +25,7 @@
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
+#include <linux/fiemap.h>
#include <linux/module.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
@@ -860,6 +861,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
@@ -966,15 +971,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -992,7 +988,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1245,8 +1241,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = usage.capacity >> shift;
buf->f_bfree = (usage.capacity - usage.used) >> shift;
buf->f_bavail = buf->f_bfree;
- buf->f_files = usage.nr_inodes;
- buf->f_ffree = U64_MAX;
+ buf->f_files = 0;
+ buf->f_ffree = 0;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
@@ -1410,6 +1406,24 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
return ret;
}
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ struct bch_dev *ca;
+ unsigned i;
+ bool first = true;
+
+ for_each_online_member(ca, c, i) {
+ if (!first)
+ seq_putc(seq, ':');
+ first = false;
+ seq_puts(seq, "/dev/");
+ seq_puts(seq, ca->name);
+ }
+
+ return 0;
+}
+
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
@@ -1433,7 +1447,6 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
}
return 0;
-
}
static const struct super_operations bch_super_operations = {
@@ -1443,6 +1456,7 @@ static const struct super_operations bch_super_operations = {
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
.statfs = bch2_statfs,
+ .show_devname = bch2_show_devname,
.show_options = bch2_show_options,
.remount_fs = bch2_remount,
#if 0
@@ -1523,7 +1537,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
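
The new show_devname hook makes /proc/mounts list every member device joined by ':'. A userspace sketch of the output format (the device names here are hypothetical):

#include <stdio.h>

int main(void)
{
	const char *devs[] = { "sda", "sdb", "sdc" };	/* hypothetical members */
	unsigned i;

	for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		if (i)
			putchar(':');
		printf("/dev/%s", devs[i]);
	}
	putchar('\n');	/* prints /dev/sda:/dev/sdb:/dev/sdc */
	return 0;
}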
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index c6ca5968a2e0..5a6df3d1973a 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "deleting inode %llu", u.bi_inum);
+ bch2_fs_lazy_rw(c);
+
ret = bch2_inode_rm(c, u.bi_inum);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
@@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
+ bch2_fs_lazy_rw(c);
+
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 4fad37fdee25..5c9c3cf54edd 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -54,7 +54,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
return false;
rcu_read_lock();
- devs = bch2_target_to_mask(c, target);
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
ca = rcu_dereference(c->devs[d]);
if (!ca)
@@ -132,10 +134,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -471,7 +473,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
+ n->have_ioref = bch2_dev_get_ioref(ca,
+ type == BCH_DATA_btree ? READ : WRITE);
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
@@ -1091,6 +1094,11 @@ again:
goto err;
}
+ /*
+ * The copygc thread is now global, which means it's no longer
+ * freeing up space on specific disks, which means that
+ * allocations for specific disks may hang arbitrarily long:
+ */
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
@@ -1100,7 +1108,8 @@ again:
op->nr_replicas_required,
op->alloc_reserve,
op->flags,
- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
EBUG_ON(!wp);
if (unlikely(IS_ERR(wp))) {
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1dde0b5d963f..56438840efd7 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -281,7 +281,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 89585833c846..bd0e6b371701 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -29,9 +29,11 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_list *jlist, struct jset *j)
+ struct journal_list *jlist, struct jset *j,
+ bool bad)
{
struct journal_replay *i, *pos;
+ struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
__le64 last_seq;
@@ -60,8 +62,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
}
list_for_each_entry_reverse(i, jlist->head, list) {
- /* Duplicate? */
- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = jlist->head;
+add:
+ i = where->next != jlist->head
+ ? container_of(where->next, struct journal_replay, list)
+ : NULL;
+
+ /*
+ * Duplicate journal entries? If so we want the one that didn't have a
+ * checksum error:
+ */
+ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ if (i->bad) {
+ devs = i->devs;
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+ } else if (bad) {
+ goto found;
+ } else {
fsck_err_on(bytes != vstruct_bytes(&i->j) ||
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
@@ -69,14 +94,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
goto found;
}
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
}
- where = jlist->head;
-add:
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i) {
ret = -ENOMEM;
@@ -84,7 +103,8 @@ add:
}
list_add(&i->list, where);
- i->devs.nr = 0;
+ i->devs = devs;
+ i->bad = bad;
memcpy(&i->j, j, bytes);
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -391,6 +411,7 @@ fsck_err:
}
static int jset_validate(struct bch_fs *c,
+ struct bch_dev *ca,
struct jset *jset, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read,
@@ -405,16 +426,19 @@ static int jset_validate(struct bch_fs *c,
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if ((version != BCH_JSET_VERSION_OLD &&
- version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max) {
- bch_err(c, "unknown journal entry version %u", jset->version);
- return BCH_FSCK_UNKNOWN_VERSION;
+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+ version < bcachefs_metadata_version_min) ||
+ version >= bcachefs_metadata_version_max, c,
+ "%s sector %llu seq %llu: unknown journal entry version %u",
+ ca->name, sector, le64_to_cpu(jset->seq),
+ version)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
}
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "journal entry too big (%zu bytes), sector %lluu",
- bytes, sector)) {
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
}
@@ -423,13 +447,15 @@ static int jset_validate(struct bch_fs *c,
return JOURNAL_ENTRY_REREAD;
if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
- "journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(jset), sector))
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+ ca->name, sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset)))
return JOURNAL_ENTRY_BAD;
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "journal checksum bad, sector %llu", sector)) {
+ "%s sector %llu seq %llu: journal checksum bad",
+ ca->name, sector, le64_to_cpu(jset->seq))) {
/* XXX: retry IO, when we start retrying checksum errors */
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
@@ -440,8 +466,10 @@ static int jset_validate(struct bch_fs *c,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq"))
+ "invalid journal entry: last_seq > seq")) {
jset->last_seq = jset->seq;
+ return JOURNAL_ENTRY_BAD;
+ }
return 0;
fsck_err:
@@ -516,11 +544,12 @@ reread:
j = buf->data;
}
- ret = jset_validate(c, j, offset,
+ ret = jset_validate(c, ca, j, offset,
end - offset, sectors_read,
READ);
switch (ret) {
case BCH_FSCK_OK:
+ sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
@@ -537,8 +566,13 @@ reread:
goto next_block;
case JOURNAL_ENTRY_BAD:
saw_bad = true;
+ /*
+ * On checksum error we don't really trust the size
+ * field of the journal entry we read, so try reading
+ * again at next block boundary:
+ */
sectors = c->opts.block_size;
- goto next_block;
+ break;
default:
return ret;
}
@@ -555,7 +589,7 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j);
+ ret = journal_entry_add(c, ca, jlist, j, ret != 0);
mutex_unlock(&jlist->lock);
switch (ret) {
@@ -566,8 +600,6 @@ reread:
default:
return ret;
}
-
- sectors = vstruct_sectors(j, c->block_bits);
next_block:
pr_debug("next");
offset += sectors;
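
In plain terms, the duplicate handling above now prefers a copy that passed its checksum. A compact sketch of that rule (simplified; the real code also verifies that two clean copies are bit-identical and carries over the device list):

#include <stdbool.h>
#include <stdio.h>

enum dup_action { KEEP_EXISTING, REPLACE_EXISTING };

static enum dup_action resolve_dup(bool existing_bad, bool new_bad)
{
	/* a clean copy always beats one that failed its checksum */
	if (existing_bad && !new_bad)
		return REPLACE_EXISTING;
	return KEEP_EXISTING;
}

int main(void)
{
	printf("%d %d %d\n",
	       resolve_dup(true,  false),	/* 1: clean copy replaces bad one */
	       resolve_dup(false, true),	/* 0: keep the clean copy we have */
	       resolve_dup(false, false));	/* 0: identical clean duplicate */
	return 0;
}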
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 72e575f360af..6958ee0f8cf2 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,8 @@
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
+ /* checksum error, but we may want to try using it anyways: */
+ bool bad;
/* must be last: */
struct jset j;
};
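
The "must be last" comment is load-bearing: struct jset is variable length, so each journal_replay is carved out of a single allocation sized offsetof(struct journal_replay, j) + vstruct_bytes(&j). A userspace sketch of that flexible-array layout (simplified stand-in types, plain malloc instead of kvpmalloc):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct jset_sketch {
	uint64_t seq;
	uint8_t	 data[];	/* variable-length payload */
};

struct journal_replay_sketch {
	int		   bad;	/* stand-in for the new flag */
	struct jset_sketch j;	/* must be last: variable length */
};

static struct journal_replay_sketch *
replay_alloc(const struct jset_sketch *src, size_t bytes)
{
	struct journal_replay_sketch *i =
		malloc(offsetof(struct journal_replay_sketch, j) + bytes);

	if (i) {
		i->bad = 0;
		memcpy(&i->j, src, bytes);	/* header + payload in one copy */
	}
	return i;
}

int main(void)
{
	struct jset_sketch src = { .seq = 7 };
	struct journal_replay_sketch *i = replay_alloc(&src, sizeof(src));

	free(i);
	return 0;
}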
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index a21de0088753..d0f1bbf8f6a7 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -36,15 +36,6 @@
* that bset, until that btree node is rewritten.
*/
-static unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
- return bl
- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
- sizeof(struct journal_seq_blacklist_entry))
- : 0;
-}
-
static unsigned sb_blacklist_u64s(unsigned nr)
{
struct bch_sb_field_journal_seq_blacklist *bl;
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index 03f4b97247fd..afb886ec8e25 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -2,6 +2,15 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+ return bl
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+ sizeof(struct journal_seq_blacklist_entry))
+ : 0;
+}
+
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4a2c4debd3f0..2f3be487ef65 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -320,12 +320,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 55aa463f992f..de0a7974ec9f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -44,13 +44,6 @@
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
-static inline int sectors_used_cmp(copygc_heap *heap,
- struct copygc_heap_entry l,
- struct copygc_heap_entry r)
-{
- return cmp_int(l.sectors, r.sectors);
-}
-
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
const struct copygc_heap_entry *l = _l;
@@ -123,6 +116,13 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
+static inline int fragmentation_cmp(copygc_heap *heap,
+ struct copygc_heap_entry l,
+ struct copygc_heap_entry r)
+{
+ return cmp_int(l.fragmentation, r.fragmentation);
+}
+
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
@@ -180,10 +180,12 @@ static int bch2_copygc(struct bch_fs *c)
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
+ .fragmentation = bucket_sectors_used(m) * (1U << 15)
+ / ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
.offset = bucket_to_sector(ca, b),
};
- heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
}
up_read(&ca->bucket_lock);
}
@@ -197,7 +199,7 @@ static int bch2_copygc(struct bch_fs *c)
sectors_to_move += i->sectors;
while (sectors_to_move > sectors_reserved) {
- BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL));
+ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
sectors_to_move -= e.sectors;
}
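
The new heap key normalizes bucket usage into 16-bit fixed point, so buckets from devices with different bucket sizes compare by how full they are rather than by raw sector counts. A quick standalone check of the arithmetic used above:

#include <stdint.h>
#include <stdio.h>

static uint16_t fragmentation(unsigned sectors_used, unsigned bucket_size)
{
	/* used fraction scaled to 0..(1 << 15), as in the diff above */
	return sectors_used * (1U << 15) / bucket_size;
}

int main(void)
{
	/* half-full buckets score the same regardless of bucket size */
	printf("%u %u %u\n",
	       fragmentation(256, 512),		/* 16384 */
	       fragmentation(1024, 2048),	/* 16384 */
	       fragmentation(2048, 2048));	/* 32768: completely full */
	return 0;
}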
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index d6a832a38b20..014c608ca0c6 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -83,7 +83,7 @@ enum opt_type {
"size", NULL) \
x(btree_node_size, u16, \
OPT_FORMAT, \
- OPT_SECTORS(1, 128), \
+ OPT_SECTORS(1, 512), \
BCH_SB_BTREE_NODE_SIZE, 512, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 28972f30e198..6e829bf0a31f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1039,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c)
}
journal_seq += 4;
+
+ /*
+ * The superblock needs to be written before we do any btree
+ * node writes: it will be in the read_write() path
+ */
}
ret = bch2_blacklist_table_initialize(c);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 1d9a6bfa8c13..30be083b09bf 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -169,10 +169,9 @@ int bch2_congested(void *data, int bdi_bits)
}
}
} else {
- unsigned target = READ_ONCE(c->opts.foreground_target);
- const struct bch_devs_mask *devs = target
- ? bch2_target_to_mask(c, target)
- : &c->rw_devs[BCH_DATA_user];
+ const struct bch_devs_mask *devs =
+ bch2_target_to_mask(c, c->opts.foreground_target) ?:
+ &c->rw_devs[BCH_DATA_user];
for_each_member_device_rcu(ca, c, i, devs) {
bdi = ca->disk_sb.bdev->bd_bdi;
@@ -384,8 +383,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- bch2_fs_read_only_async(c);
bch2_journal_halt(&c->journal);
+ bch2_fs_read_only_async(c);
wake_up(&bch_read_only_wait);
return ret;
@@ -442,6 +441,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
+ /*
+ * We need to write out a journal entry before we start doing btree
+ * updates, to ensure that on unclean shutdown new journal blacklist
+ * entries are created:
+ */
+ bch2_journal_meta(&c->journal);
+
clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
for_each_rw_member(ca, c, i)
@@ -1820,7 +1826,6 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
{
-
struct block_device *bdev = lookup_bdev(path);
struct bch_dev *ca;
unsigned i;
@@ -1845,6 +1850,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
{
struct bch_sb_handle *sb = NULL;
struct bch_fs *c = NULL;
+ struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
int ret = -ENOMEM;
@@ -1880,10 +1886,24 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
le64_to_cpu(sb[best_sb].sb->seq))
best_sb = i;
- for (i = 0; i < nr_devices; i++) {
+ mi = bch2_sb_get_members(sb[best_sb].sb);
+
+ i = 0;
+ while (i < nr_devices) {
+ if (i != best_sb &&
+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
+ char buf[BDEVNAME_SIZE];
+ pr_info("%s has been removed, skipping",
+ bdevname(sb[i].bdev, buf));
+ bch2_free_super(&sb[i]);
+ array_remove_item(sb, nr_devices, i);
+ continue;
+ }
+
err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
if (err)
goto err_print;
+ i++;
}
ret = -ENOMEM;
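
This loop is the commit's headline change: instead of failing the mount, superblocks whose device index no longer appears in the member list are dropped and the array is compacted in place. A simplified userspace sketch of the filtering (stand-in types; the real code also skips the best/newest superblock and frees each dropped one):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct sb_sketch { int dev_idx; };

static unsigned filter_removed(struct sb_sketch *sb, unsigned nr,
			       const bool *member_exists)
{
	unsigned i = 0;

	while (i < nr) {
		if (!member_exists[sb[i].dev_idx]) {
			printf("dev_idx %d has been removed, skipping\n",
			       sb[i].dev_idx);
			memmove(&sb[i], &sb[i + 1],
				(nr - i - 1) * sizeof(*sb));	/* compact */
			nr--;
			continue;
		}
		i++;
	}
	return nr;
}

int main(void)
{
	struct sb_sketch sb[] = { { 0 }, { 1 }, { 2 } };
	bool exists[] = { true, false, true };	/* device 1 was removed */
	unsigned nr = filter_removed(sb, 3, exists);

	printf("%u superblocks remain\n", nr);	/* prints 2 */
	return 0;
}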
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 4aa5dd7917cf..fffee96726ce 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -222,6 +222,15 @@ void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+ if (percpu_ref_is_zero(&c->writes))
+ bch2_fs_read_write_early(c);
+}
+
void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 119c86122023..f48c6380684f 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -99,7 +99,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 725a6f3ef8ce..21f64cb7e402 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -511,7 +511,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
mutex_lock(&inode->ei_update_lock);
if (inode_opt_id == Inode_opt_project) {
- ret = bch2_set_projid(c, inode, s.v);
+ /*
+ * inode fields accessible via the xattr interface are stored
+ * with a +1 bias, so that 0 means unset:
+ */
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
if (ret)
goto err;
}