author		Kent Overstreet <kent.overstreet@gmail.com>	2020-09-07 14:13:00 -0400
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-09-07 14:13:00 -0400
commit		4e21b048c317fac4ca43eb7cdcf8918f84dec12a (patch)
tree		b6c13ff97bb697d0c795543096cc87fee92b4c17
parent		f54540c1c0413ebb280b68c4aa2d68ab8ba6b70e (diff)

Merge with fb2821e726 bcachefs: Don't fail mount if device has been removed
-rw-r--r--  fs/bcachefs/alloc_background.c      |   2
-rw-r--r--  fs/bcachefs/bcachefs.h              |   5
-rw-r--r--  fs/bcachefs/btree_cache.c           |   7
-rw-r--r--  fs/bcachefs/btree_io.c              |  28
-rw-r--r--  fs/bcachefs/btree_io.h              |  29
-rw-r--r--  fs/bcachefs/btree_key_cache.c       |   2
-rw-r--r--  fs/bcachefs/buckets_types.h         |   1
-rw-r--r--  fs/bcachefs/checksum.c              |  31
-rw-r--r--  fs/bcachefs/checksum.h              |   6
-rw-r--r--  fs/bcachefs/compress.c              |   2
-rw-r--r--  fs/bcachefs/disk_groups.c           |  11
-rw-r--r--  fs/bcachefs/disk_groups.h           |   3
-rw-r--r--  fs/bcachefs/ec.c                    |   2
-rw-r--r--  fs/bcachefs/fs-io.c                 | 276
-rw-r--r--  fs/bcachefs/fs-io.h                 |   4
-rw-r--r--  fs/bcachefs/fs-ioctl.c              |   6
-rw-r--r--  fs/bcachefs/fs.c                    |  42
-rw-r--r--  fs/bcachefs/fsck.c                  |   4
-rw-r--r--  fs/bcachefs/io.c                    |  19
-rw-r--r--  fs/bcachefs/journal.h               |   2
-rw-r--r--  fs/bcachefs/journal_io.c            |  84
-rw-r--r--  fs/bcachefs/journal_io.h            |   2
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c |   9
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h |   9
-rw-r--r--  fs/bcachefs/move.c                  |   4
-rw-r--r--  fs/bcachefs/movinggc.c              |  20
-rw-r--r--  fs/bcachefs/opts.h                  |   2
-rw-r--r--  fs/bcachefs/recovery.c              |   5
-rw-r--r--  fs/bcachefs/super.c                 |  34
-rw-r--r--  fs/bcachefs/super.h                 |   9
-rw-r--r--  fs/bcachefs/util.h                  |  31
-rw-r--r--  fs/bcachefs/xattr.c                 |   6
32 files changed, 274 insertions(+), 423 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 43b9f99194b9..9aa0b42b26b6 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -350,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
while (1) {
+ bch2_trans_cond_resched(&trans);
+
ret = bch2_alloc_write_key(&trans, iter, flags);
if (ret < 0 || ret == ALLOC_END)
break;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 06bb267e94f1..3a5a00e53cbf 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -202,7 +202,8 @@
#include "opts.h"
#include "util.h"
-#include <linux/dynamic_fault.h>
+#define dynamic_fault(...) 0
+#define race_fault(...) 0
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
@@ -734,7 +735,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index a0d570f3adf0..736671112861 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -73,10 +73,6 @@ static const struct rhashtable_params bch_btree_cache_params = {
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
@@ -85,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 7c3fb5fb0cca..2f5097218f9c 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -597,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bch2_btree_iter_reinit_node(iter, b);
}
-static struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
- struct nonce nonce = btree_nonce(i, offset);
-
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
-
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
- }
-
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct btree *b, struct bset *i,
unsigned offset, int write)
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 66ebdd39f5b3..626d0f071b70 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -5,6 +5,7 @@
#include "bkey_methods.h"
#include "bset.h"
#include "btree_locking.h"
+#include "checksum.h"
#include "extents.h"
#include "io_types.h"
@@ -82,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *
return false;
}
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
+
+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+ struct nonce nonce = btree_nonce(i, offset);
+
+ if (!offset) {
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+ bytes);
+
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+ }
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
+
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
void bch2_btree_build_aux_trees(struct btree *);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index d73cc8ddadac..61662750dfc0 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -391,7 +391,7 @@ static void btree_key_cache_journal_flush(struct journal *j,
struct btree_trans trans;
six_lock_read(&ck->c.lock, NULL, NULL);
- key = READ_ONCE(ck->key);
+ key = ck->key;
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 4ebe80b05ffc..d5215b14d7d9 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -125,6 +125,7 @@ struct disk_reservation {
struct copygc_heap_entry {
u8 dev;
u8 gen;
+ u16 fragmentation;
u32 sectors;
u64 offset;
};
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
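
For reference, a minimal standalone sketch of the nonce_add() arithmetic above (not part of the diff; struct nonce and the __le32 handling are simplified to plain uint32_t, assuming a little-endian host). ChaCha20 is seekable in 64-byte blocks, so encrypting or decrypting at a byte offset only requires bumping the 32-bit block counter in the nonce by offset / CHACHA_BLOCK_SIZE:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

struct nonce { uint32_t d[4]; };	/* simplified: __le32 -> uint32_t */

static struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
	/* offsets must be a whole number of ChaCha blocks */
	assert(!(offset & (CHACHA_BLOCK_SIZE - 1)));
	nonce.d[0] += offset / CHACHA_BLOCK_SIZE;
	return nonce;
}

int main(void)
{
	struct nonce n = {{ 0, 1, 2, 3 }};

	n = nonce_add(n, 2 * CHACHA_BLOCK_SIZE);
	printf("block counter = %u\n", n.d[0]);	/* prints 2 */
	return 0;
}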
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 47838fd2db06..b50d2b0d5fd3 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 4a4ec8f46108..c52b6faac9b4 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- return t.group < g->nr && !g->entries[t.group].deleted
+ return g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
}
@@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
- m = t.group < g->nr && !g->entries[t.group].deleted
+ m = g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
@@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
+ int ret = 0;
mutex_lock(&c->sb_lock);
@@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
return v;
}
+ ret = bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ goto unlock;
write_sb:
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return ret;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index c8e0c37a5e1a..3d84f23c34ed 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+
void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
unsigned);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 425b0b806cee..5514f65378ad 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1594,7 +1594,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(h, &c->ec_stripe_new_list, list) {
+ list_for_each_entry(s, &c->ec_stripe_new_list, list) {
pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
s->blocks.nr,
bitmap_weight(s->blocks_allocated,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 2d08263f3a42..55004998536d 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -603,7 +603,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -628,10 +628,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -783,11 +783,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1038,32 +1035,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1087,7 +1085,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1241,7 +1239,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1810,8 +1808,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
bool sync = dio->sync;
long ret;
@@ -1820,7 +1819,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
while (1) {
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -1828,7 +1827,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
if (unlikely(ret < 0))
goto err;
@@ -1842,7 +1841,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1905,7 +1904,7 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -2816,235 +2815,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3240,7 +3010,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 031e6d931171..0873d2f0928c 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (fa.fsx_projid >= U32_MAX)
return -EINVAL;
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
s.projid = fa.fsx_projid + 1;
ret = mnt_want_write_file(file);
@@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_set_projid(c, inode, s.projid);
+ ret = bch2_set_projid(c, inode, fa.fsx_projid);
if (ret)
goto err_unlock;
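
A minimal userspace sketch of the +1 bias described in the new comment (illustrative only, not the kernel implementation): inode fields exposed through the xattr/ioctl interface store value + 1 so that 0 can mean "unset", which is why the fix above passes the unbiased fa.fsx_projid to bch2_set_projid while the biased s.projid is what gets stored:

#include <stdint.h>
#include <stdio.h>

/* stored field: 0 means unset, otherwise projid + 1 */
static uint32_t projid_to_field(uint32_t projid)
{
	return projid + 1;
}

static uint32_t field_to_projid(uint32_t field)
{
	return field ? field - 1 : 0;
}

int main(void)
{
	uint32_t stored = projid_to_field(42);

	printf("stored=%u projid=%u\n", stored, field_to_projid(stored));
	/* prints stored=43 projid=42; a stored 0 decodes as projid 0/unset */
	return 0;
}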
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ba73e5258e8d..e504e6b19abe 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -25,6 +25,7 @@
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
+#include <linux/fiemap.h>
#include <linux/module.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
@@ -860,6 +861,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
@@ -966,15 +971,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -992,7 +988,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1245,8 +1241,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = usage.capacity >> shift;
buf->f_bfree = (usage.capacity - usage.used) >> shift;
buf->f_bavail = buf->f_bfree;
- buf->f_files = usage.nr_inodes;
- buf->f_ffree = U64_MAX;
+ buf->f_files = 0;
+ buf->f_ffree = 0;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
@@ -1410,6 +1406,24 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
return ret;
}
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ struct bch_dev *ca;
+ unsigned i;
+ bool first = true;
+
+ for_each_online_member(ca, c, i) {
+ if (!first)
+ seq_putc(seq, ':');
+ first = false;
+ seq_puts(seq, "/dev/");
+ seq_puts(seq, ca->name);
+ }
+
+ return 0;
+}
+
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
@@ -1433,7 +1447,6 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
}
return 0;
-
}
static const struct super_operations bch_super_operations = {
@@ -1443,6 +1456,7 @@ static const struct super_operations bch_super_operations = {
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
.statfs = bch2_statfs,
+ .show_devname = bch2_show_devname,
.show_options = bch2_show_options,
.remount_fs = bch2_remount,
#if 0
@@ -1523,7 +1537,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
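
The new show_devname hook makes /proc/mounts list every member device joined by ':'. A userspace sketch of the output format (the device names here are hypothetical):

#include <stdio.h>

int main(void)
{
	const char *devs[] = { "sda", "sdb", "sdc" };	/* hypothetical members */
	unsigned i;

	for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		if (i)
			putchar(':');
		printf("/dev/%s", devs[i]);
	}
	putchar('\n');	/* prints /dev/sda:/dev/sdb:/dev/sdc */
	return 0;
}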
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index c6ca5968a2e0..5a6df3d1973a 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "deleting inode %llu", u.bi_inum);
+ bch2_fs_lazy_rw(c);
+
ret = bch2_inode_rm(c, u.bi_inum);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
@@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
+ bch2_fs_lazy_rw(c);
+
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 4fad37fdee25..5c9c3cf54edd 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -54,7 +54,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
return false;
rcu_read_lock();
- devs = bch2_target_to_mask(c, target);
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
ca = rcu_dereference(c->devs[d]);
if (!ca)
@@ -132,10 +134,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -471,7 +473,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
+ n->have_ioref = bch2_dev_get_ioref(ca,
+ type == BCH_DATA_btree ? READ : WRITE);
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
@@ -1091,6 +1094,11 @@ again:
goto err;
}
+ /*
+ * The copygc thread is now global, which means it's no longer
+ * freeing up space on specific disks, which means that
+ * allocations for specific disks may hang arbitrarily long:
+ */
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
@@ -1100,7 +1108,8 @@ again:
op->nr_replicas_required,
op->alloc_reserve,
op->flags,
- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
EBUG_ON(!wp);
if (unlikely(IS_ERR(wp))) {
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1dde0b5d963f..56438840efd7 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -281,7 +281,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 89585833c846..bd0e6b371701 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -29,9 +29,11 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_list *jlist, struct jset *j)
+ struct journal_list *jlist, struct jset *j,
+ bool bad)
{
struct journal_replay *i, *pos;
+ struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
__le64 last_seq;
@@ -60,8 +62,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
}
list_for_each_entry_reverse(i, jlist->head, list) {
- /* Duplicate? */
- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = jlist->head;
+add:
+ i = where->next != jlist->head
+ ? container_of(where->next, struct journal_replay, list)
+ : NULL;
+
+ /*
+ * Duplicate journal entries? If so we want the one that didn't have a
+ * checksum error:
+ */
+ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ if (i->bad) {
+ devs = i->devs;
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+ } else if (bad) {
+ goto found;
+ } else {
fsck_err_on(bytes != vstruct_bytes(&i->j) ||
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
@@ -69,14 +94,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
goto found;
}
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
}
- where = jlist->head;
-add:
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i) {
ret = -ENOMEM;
@@ -84,7 +103,8 @@ add:
}
list_add(&i->list, where);
- i->devs.nr = 0;
+ i->devs = devs;
+ i->bad = bad;
memcpy(&i->j, j, bytes);
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -391,6 +411,7 @@ fsck_err:
}
static int jset_validate(struct bch_fs *c,
+ struct bch_dev *ca,
struct jset *jset, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read,
@@ -405,16 +426,19 @@ static int jset_validate(struct bch_fs *c,
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if ((version != BCH_JSET_VERSION_OLD &&
- version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max) {
- bch_err(c, "unknown journal entry version %u", jset->version);
- return BCH_FSCK_UNKNOWN_VERSION;
+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+ version < bcachefs_metadata_version_min) ||
+ version >= bcachefs_metadata_version_max, c,
+ "%s sector %llu seq %llu: unknown journal entry version %u",
+ ca->name, sector, le64_to_cpu(jset->seq),
+ version)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
}
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "journal entry too big (%zu bytes), sector %lluu",
- bytes, sector)) {
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
}
@@ -423,13 +447,15 @@ static int jset_validate(struct bch_fs *c,
return JOURNAL_ENTRY_REREAD;
if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
- "journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(jset), sector))
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+ ca->name, sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset)))
return JOURNAL_ENTRY_BAD;
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "journal checksum bad, sector %llu", sector)) {
+ "%s sector %llu seq %llu: journal checksum bad",
+ ca->name, sector, le64_to_cpu(jset->seq))) {
/* XXX: retry IO, when we start retrying checksum errors */
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
@@ -440,8 +466,10 @@ static int jset_validate(struct bch_fs *c,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq"))
+ "invalid journal entry: last_seq > seq")) {
jset->last_seq = jset->seq;
+ return JOURNAL_ENTRY_BAD;
+ }
return 0;
fsck_err:
@@ -516,11 +544,12 @@ reread:
j = buf->data;
}
- ret = jset_validate(c, j, offset,
+ ret = jset_validate(c, ca, j, offset,
end - offset, sectors_read,
READ);
switch (ret) {
case BCH_FSCK_OK:
+ sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
@@ -537,8 +566,13 @@ reread:
goto next_block;
case JOURNAL_ENTRY_BAD:
saw_bad = true;
+ /*
+ * On checksum error we don't really trust the size
+ * field of the journal entry we read, so try reading
+ * again at next block boundary:
+ */
sectors = c->opts.block_size;
- goto next_block;
+ break;
default:
return ret;
}
@@ -555,7 +589,7 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j);
+ ret = journal_entry_add(c, ca, jlist, j, ret != 0);
mutex_unlock(&jlist->lock);
switch (ret) {
@@ -566,8 +600,6 @@ reread:
default:
return ret;
}
-
- sectors = vstruct_sectors(j, c->block_bits);
next_block:
pr_debug("next");
offset += sectors;
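
In plain terms, the duplicate handling above now prefers a copy that passed its checksum. A compact sketch of that rule (simplified; the real code also verifies that two clean copies are bit-identical and carries over the device list):

#include <stdbool.h>
#include <stdio.h>

enum dup_action { KEEP_EXISTING, REPLACE_EXISTING };

static enum dup_action resolve_dup(bool existing_bad, bool new_bad)
{
	/* a clean copy always beats one that failed its checksum */
	if (existing_bad && !new_bad)
		return REPLACE_EXISTING;
	return KEEP_EXISTING;
}

int main(void)
{
	printf("%d %d %d\n",
	       resolve_dup(true,  false),	/* 1: clean copy replaces bad one */
	       resolve_dup(false, true),	/* 0: keep the clean copy we have */
	       resolve_dup(false, false));	/* 0: identical clean duplicate */
	return 0;
}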
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 72e575f360af..6958ee0f8cf2 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,8 @@
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
+ /* checksum error, but we may want to try using it anyways: */
+ bool bad;
/* must be last: */
struct jset j;
};
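
The "must be last" comment is load-bearing: struct jset is variable length, so each journal_replay is carved out of a single allocation sized offsetof(struct journal_replay, j) + vstruct_bytes(&j). A userspace sketch of that flexible-array layout (simplified stand-in types, plain malloc instead of kvpmalloc):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct jset_sketch {
	uint64_t seq;
	uint8_t	 data[];	/* variable-length payload */
};

struct journal_replay_sketch {
	int		   bad;	/* stand-in for the new flag */
	struct jset_sketch j;	/* must be last: variable length */
};

static struct journal_replay_sketch *
replay_alloc(const struct jset_sketch *src, size_t bytes)
{
	struct journal_replay_sketch *i =
		malloc(offsetof(struct journal_replay_sketch, j) + bytes);

	if (i) {
		i->bad = 0;
		memcpy(&i->j, src, bytes);	/* header + payload in one copy */
	}
	return i;
}

int main(void)
{
	struct jset_sketch src = { .seq = 7 };
	struct journal_replay_sketch *i = replay_alloc(&src, sizeof(src));

	free(i);
	return 0;
}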
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index a21de0088753..d0f1bbf8f6a7 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -36,15 +36,6 @@
* that bset, until that btree node is rewritten.
*/
-static unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
- return bl
- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
- sizeof(struct journal_seq_blacklist_entry))
- : 0;
-}
-
static unsigned sb_blacklist_u64s(unsigned nr)
{
struct bch_sb_field_journal_seq_blacklist *bl;
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index 03f4b97247fd..afb886ec8e25 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -2,6 +2,15 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+ return bl
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+ sizeof(struct journal_seq_blacklist_entry))
+ : 0;
+}
+
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4a2c4debd3f0..2f3be487ef65 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -320,12 +320,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 55aa463f992f..de0a7974ec9f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -44,13 +44,6 @@
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
-static inline int sectors_used_cmp(copygc_heap *heap,
- struct copygc_heap_entry l,
- struct copygc_heap_entry r)
-{
- return cmp_int(l.sectors, r.sectors);
-}
-
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
const struct copygc_heap_entry *l = _l;
@@ -123,6 +116,13 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
+static inline int fragmentation_cmp(copygc_heap *heap,
+ struct copygc_heap_entry l,
+ struct copygc_heap_entry r)
+{
+ return cmp_int(l.fragmentation, r.fragmentation);
+}
+
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
@@ -180,10 +180,12 @@ static int bch2_copygc(struct bch_fs *c)
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
+ .fragmentation = bucket_sectors_used(m) * (1U << 15)
+ / ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
.offset = bucket_to_sector(ca, b),
};
- heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
}
up_read(&ca->bucket_lock);
}
@@ -197,7 +199,7 @@ static int bch2_copygc(struct bch_fs *c)
sectors_to_move += i->sectors;
while (sectors_to_move > sectors_reserved) {
- BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL));
+ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
sectors_to_move -= e.sectors;
}
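
The new heap key normalizes bucket usage into 16-bit fixed point, so buckets from devices with different bucket sizes compare by how full they are rather than by raw sector counts. A quick standalone check of the arithmetic used above:

#include <stdint.h>
#include <stdio.h>

static uint16_t fragmentation(unsigned sectors_used, unsigned bucket_size)
{
	/* used fraction scaled to 0..(1 << 15), as in the diff above */
	return sectors_used * (1U << 15) / bucket_size;
}

int main(void)
{
	/* half-full buckets score the same regardless of bucket size */
	printf("%u %u %u\n",
	       fragmentation(256, 512),		/* 16384 */
	       fragmentation(1024, 2048),	/* 16384 */
	       fragmentation(2048, 2048));	/* 32768: completely full */
	return 0;
}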
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index d6a832a38b20..014c608ca0c6 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -83,7 +83,7 @@ enum opt_type {
"size", NULL) \
x(btree_node_size, u16, \
OPT_FORMAT, \
- OPT_SECTORS(1, 128), \
+ OPT_SECTORS(1, 512), \
BCH_SB_BTREE_NODE_SIZE, 512, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 28972f30e198..6e829bf0a31f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1039,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c)
}
journal_seq += 4;
+
+ /*
+ * The superblock needs to be written before we do any btree
+ * node writes: it will be in the read_write() path
+ */
}
ret = bch2_blacklist_table_initialize(c);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 1d9a6bfa8c13..30be083b09bf 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -169,10 +169,9 @@ int bch2_congested(void *data, int bdi_bits)
}
}
} else {
- unsigned target = READ_ONCE(c->opts.foreground_target);
- const struct bch_devs_mask *devs = target
- ? bch2_target_to_mask(c, target)
- : &c->rw_devs[BCH_DATA_user];
+ const struct bch_devs_mask *devs =
+ bch2_target_to_mask(c, c->opts.foreground_target) ?:
+ &c->rw_devs[BCH_DATA_user];
for_each_member_device_rcu(ca, c, i, devs) {
bdi = ca->disk_sb.bdev->bd_bdi;
@@ -384,8 +383,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- bch2_fs_read_only_async(c);
bch2_journal_halt(&c->journal);
+ bch2_fs_read_only_async(c);
wake_up(&bch_read_only_wait);
return ret;
@@ -442,6 +441,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
+ /*
+ * We need to write out a journal entry before we start doing btree
+ * updates, to ensure that on unclean shutdown new journal blacklist
+ * entries are created:
+ */
+ bch2_journal_meta(&c->journal);
+
clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
for_each_rw_member(ca, c, i)
@@ -1820,7 +1826,6 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
{
-
struct block_device *bdev = lookup_bdev(path);
struct bch_dev *ca;
unsigned i;
@@ -1845,6 +1850,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
{
struct bch_sb_handle *sb = NULL;
struct bch_fs *c = NULL;
+ struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
int ret = -ENOMEM;
@@ -1880,10 +1886,24 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
le64_to_cpu(sb[best_sb].sb->seq))
best_sb = i;
- for (i = 0; i < nr_devices; i++) {
+ mi = bch2_sb_get_members(sb[best_sb].sb);
+
+ i = 0;
+ while (i < nr_devices) {
+ if (i != best_sb &&
+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
+ char buf[BDEVNAME_SIZE];
+ pr_info("%s has been removed, skipping",
+ bdevname(sb[i].bdev, buf));
+ bch2_free_super(&sb[i]);
+ array_remove_item(sb, nr_devices, i);
+ continue;
+ }
+
err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
if (err)
goto err_print;
+ i++;
}
ret = -ENOMEM;
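
This loop is the commit's headline change: instead of failing the mount, superblocks whose device index no longer appears in the member list are dropped and the array is compacted in place. A simplified userspace sketch of the filtering (stand-in types; the real code also skips the best/newest superblock and frees each dropped one):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct sb_sketch { int dev_idx; };

static unsigned filter_removed(struct sb_sketch *sb, unsigned nr,
			       const bool *member_exists)
{
	unsigned i = 0;

	while (i < nr) {
		if (!member_exists[sb[i].dev_idx]) {
			printf("dev_idx %d has been removed, skipping\n",
			       sb[i].dev_idx);
			memmove(&sb[i], &sb[i + 1],
				(nr - i - 1) * sizeof(*sb));	/* compact */
			nr--;
			continue;
		}
		i++;
	}
	return nr;
}

int main(void)
{
	struct sb_sketch sb[] = { { 0 }, { 1 }, { 2 } };
	bool exists[] = { true, false, true };	/* device 1 was removed */
	unsigned nr = filter_removed(sb, 3, exists);

	printf("%u superblocks remain\n", nr);	/* prints 2 */
	return 0;
}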
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 4aa5dd7917cf..fffee96726ce 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -222,6 +222,15 @@ void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+ if (percpu_ref_is_zero(&c->writes))
+ bch2_fs_read_write_early(c);
+}
+
void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 119c86122023..f48c6380684f 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -99,7 +99,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 725a6f3ef8ce..21f64cb7e402 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -511,7 +511,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
mutex_lock(&inode->ei_update_lock);
if (inode_opt_id == Inode_opt_project) {
- ret = bch2_set_projid(c, inode, s.v);
+ /*
+ * inode fields accessible via the xattr interface are stored
+ * with a +1 bias, so that 0 means unset:
+ */
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
if (ret)
goto err;
}