summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2024-09-07 21:01:09 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2024-09-07 21:14:35 -0400
commitcd35891eb95ee8b1d7512eda06d1218eacae3842 (patch)
tree209931b6a1d96aa96dc5a9c40ea2bf2e5a3ad636
parentf9ec00d5ca00146dde43382a8ef234589ae129c1 (diff)
Update bcachefs sources to ec2ddb95112b bcachefs: bch2_opts_to_text()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/swap.h7
-rw-r--r--include/linux/time64.h14
-rw-r--r--include/linux/types.h2
-rw-r--r--libbcachefs/acl.c5
-rw-r--r--libbcachefs/alloc_background.c66
-rw-r--r--libbcachefs/alloc_background.h1
-rw-r--r--libbcachefs/alloc_foreground.c54
-rw-r--r--libbcachefs/alloc_foreground.h5
-rw-r--r--libbcachefs/backpointers.c101
-rw-r--r--libbcachefs/bcachefs.h12
-rw-r--r--libbcachefs/btree_cache.c271
-rw-r--r--libbcachefs/btree_cache.h3
-rw-r--r--libbcachefs/btree_gc.c21
-rw-r--r--libbcachefs/btree_io.c8
-rw-r--r--libbcachefs/btree_io.h4
-rw-r--r--libbcachefs/btree_iter.h54
-rw-r--r--libbcachefs/btree_journal_iter.c2
-rw-r--r--libbcachefs/btree_key_cache.c8
-rw-r--r--libbcachefs/btree_types.h57
-rw-r--r--libbcachefs/btree_update_interior.c11
-rw-r--r--libbcachefs/buckets.c42
-rw-r--r--libbcachefs/buckets.h15
-rw-r--r--libbcachefs/buckets_types.h8
-rw-r--r--libbcachefs/checksum.c101
-rw-r--r--libbcachefs/data_update.c3
-rw-r--r--libbcachefs/ec.c446
-rw-r--r--libbcachefs/ec.h15
-rw-r--r--libbcachefs/ec_format.h9
-rw-r--r--libbcachefs/ec_types.h1
-rw-r--r--libbcachefs/errcode.h15
-rw-r--r--libbcachefs/extents.c77
-rw-r--r--libbcachefs/extents.h43
-rw-r--r--libbcachefs/fs-common.c5
-rw-r--r--libbcachefs/fs-io-buffered.c149
-rw-r--r--libbcachefs/fs.c136
-rw-r--r--libbcachefs/io_read.c12
-rw-r--r--libbcachefs/io_write.c7
-rw-r--r--libbcachefs/journal_io.c3
-rw-r--r--libbcachefs/journal_reclaim.c7
-rw-r--r--libbcachefs/opts.c29
-rw-r--r--libbcachefs/opts.h14
-rw-r--r--libbcachefs/rcu_pending.c4
-rw-r--r--libbcachefs/rebalance.c3
-rw-r--r--libbcachefs/recovery.c22
-rw-r--r--libbcachefs/recovery_passes.c10
-rw-r--r--libbcachefs/replicas.c13
-rw-r--r--libbcachefs/replicas_format.h9
-rw-r--r--libbcachefs/sb-errors_format.h8
-rw-r--r--libbcachefs/sb-members.c51
-rw-r--r--libbcachefs/sb-members.h20
-rw-r--r--libbcachefs/sb-members_format.h5
-rw-r--r--libbcachefs/str_hash.h2
-rw-r--r--libbcachefs/super-io.c2
-rw-r--r--libbcachefs/super.c82
-rw-r--r--libbcachefs/sysfs.c19
-rw-r--r--libbcachefs/util.c14
-rw-r--r--libbcachefs/xattr_format.h2
59 files changed, 1293 insertions, 820 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 485f92e1..ec5195e1 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d
+ec2ddb95112b8967753591b16e2e439eee76c5b1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 99d6a47a..153bd73d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,6 +65,8 @@
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
+#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
+#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
diff --git a/include/linux/swap.h b/include/linux/swap.h
new file mode 100644
index 00000000..81864222
--- /dev/null
+++ b/include/linux/swap.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SWAP_H
+#define _LINUX_SWAP_H
+
+static inline void mm_account_reclaimed_pages(unsigned long pages) {}
+
+#endif /* _LINUX_SWAP_H */
diff --git a/include/linux/time64.h b/include/linux/time64.h
index cd6cc1c1..0cef3f8c 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -44,6 +44,20 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
return t;
}
+static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+
#define ns_to_timespec64 ns_to_timespec
#define timespec64_to_ns timespec_to_ns
#define timespec64_trunc timespec_trunc
diff --git a/include/linux/types.h b/include/linux/types.h
index 004d5eb0..5ee5ebc6 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -37,6 +37,8 @@ typedef unsigned gfp_t;
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0
#define __GFP_NOFAIL 0
+#define __GFP_ACCOUNT 0
+#define __GFP_RECLAIMABLE 0
#define __GFP_ZERO 1
#define GFP_KERNEL 2
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 87f1be9d..1def6187 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -137,7 +137,7 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
return NULL;
acl = allocate_dropping_locks(trans, ret,
- posix_acl_alloc(count, _gfp));
+ posix_acl_alloc(count, GFP_KERNEL));
if (!acl)
return ERR_PTR(-ENOMEM);
if (ret) {
@@ -427,7 +427,8 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
if (ret)
goto err;
- ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
+ ret = allocate_dropping_locks_errcode(trans,
+ __posix_acl_chmod(&acl, GFP_KERNEL, mode));
if (ret)
goto err;
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index b54ce7f8..51a01423 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1969,8 +1969,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@@ -1980,18 +1980,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (discard_in_flight_add(ca, bucket, false))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -2133,26 +2133,26 @@ static void bch2_do_invalidates_work(struct work_struct *work)
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
@@ -2298,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
return 0;
}
+/* device removal */
+
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_dev_remove_stripes(c, ca) ?:
+ bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
+ bch_err_msg(c, ret, "removing dev alloc info");
+ return ret;
+}
+
/* Bucket IO clocks: */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -2433,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ c->rw_devs_change_count++;
+
/*
* Capacity is calculated based off of devices in allocation groups:
*/
@@ -2468,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
}
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index fd790b03..577f823a 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
+int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 084b03b8..d0e0b568 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
enum bch_watermark watermark,
enum bch_data_type data_type,
struct closure *cl,
+ bool nowait,
struct bch_dev_usage *usage)
{
struct bch_fs *c = trans->c;
@@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bucket_alloc_state s = {
.btree_bitmap = data_type == BCH_DATA_btree,
};
- bool waiting = false;
+ bool waiting = nowait;
again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
@@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, &usage)));
+ data_type, cl, false, &usage)));
return ob;
}
@@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
@@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
+ enum bch_write_flags flags,
enum bch_data_type data_type,
enum bch_watermark watermark,
struct closure *cl)
@@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob)) {
+ have_cache, ob)) {
ret = 0;
break;
}
@@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
@@ -883,7 +884,7 @@ got_bucket:
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
out_put_head:
bch2_ec_stripe_head_put(c, h);
return ret;
@@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- bool ec, unsigned flags)
+ bool ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
have_cache, ec, ob))
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
else
ob_push(c, &ptrs_skip, ob);
}
@@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache, bool ec,
- enum bch_watermark watermark,
- unsigned flags)
+ enum bch_watermark watermark)
{
int i, ret = 0;
@@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
if (ret)
break;
}
@@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
@@ -1024,13 +1024,13 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, flags);
+ have_cache, erasure_code);
if (ret)
return ret;
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, watermark, flags);
+ have_cache, erasure_code, watermark);
if (ret)
return ret;
@@ -1071,7 +1071,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
int ret;
@@ -1373,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl,
struct write_point **wp_ret)
{
@@ -1389,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
-
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@@ -1495,11 +1493,12 @@ err:
try_decrease_writepoints(trans, write_points_nr))
goto retry;
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- return cl
- ? -BCH_ERR_bucket_alloc_blocked
- : -BCH_ERR_ENOSPC_bucket_alloc;
+ ret = -BCH_ERR_bucket_alloc_blocked;
return ret;
}
@@ -1730,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
bch2_dev_usage_to_text(out, ca, &stats);
prt_newline(out);
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 386d231c..1a16fd5b 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
return ret;
}
+enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, unsigned,
+ unsigned, unsigned *, bool *, enum bch_write_flags,
enum bch_data_type, enum bch_watermark,
struct closure *);
@@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct bch_devs_list *,
unsigned, unsigned,
enum bch_watermark,
- unsigned,
+ enum bch_write_flags,
struct closure *,
struct write_point **);
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index d4da6343..f1862c3f 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -3,12 +3,14 @@
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
+#include "disk_accounting.h"
#include "error.h"
#include <linux/mm.h>
@@ -750,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
+ bch2_btree_cache_unpin(c);
+
btree_interior_mask |= btree_leaf_mask;
- c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
- c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@@ -775,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
+ bch2_node_pin(c, b);
0;
}));
}
@@ -782,12 +787,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+static void progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
+ struct progress_indicator_state progress;
int ret = 0;
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
@@ -805,6 +878,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -865,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@@ -920,19 +993,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
+ struct bch_fs *c = trans->c;
struct bkey_buf last_flushed;
+ struct progress_indicator_state progress;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed));
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
@@ -977,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index d43bbdba..c711d4c2 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -542,7 +542,7 @@ struct bch_dev {
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
- struct bucket_array __rcu *buckets_gc;
+ GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
@@ -871,6 +871,7 @@ struct bch_fs {
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
+ unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
u64 reserved; /* sectors */
@@ -1045,8 +1046,6 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
- /* bitmap of explicitly enabled recovery passes: */
- u64 recovery_passes_explicit;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
@@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
+ s64 sec;
s32 rem;
time += c->sb.time_base_lo;
- t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
- t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
return t;
}
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 662f0f79..7b951b27 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -15,11 +15,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/swap.h>
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
if (shrinker_counter) \
- bc->not_freed_##counter++; \
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)
const char * const bch2_btree_node_flags[] = {
@@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
- unsigned i, reserve = 16;
+ unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
- c->btree_cache.reserve = reserve;
+ c->btree_cache.nr_reserve = reserve;
}
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->used - bc->reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
+ BUG_ON(btree_node_hashed(b));
+
+ /*
+ * This should really be done in slub/vmalloc, but we're using the
+ * kmalloc_large() path, so we're working around a slub bug by doing
+ * this here:
+ */
+ if (b->data)
+ mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+ if (b->aux_data)
+ mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
@@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
- bc->used--;
+ bc->nr_freeable--;
btree_node_to_freedlist(bc, b);
}
@@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
bch2_btree_lock_init(&b->c, 0);
- bc->used++;
+ bc->nr_freeable++;
list_add(&b->list, &bc->freeable);
return b;
}
@@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ BUG_ON(!__btree_node_pinned(bc, b));
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
+ lockdep_assert_held(&bc->lock);
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
@@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
b->hash_val = 0;
if (b->c.btree_id < BTREE_ID_NR)
- --bc->used_by_btree[b->c.btree_id];
+ --bc->nr_by_btree[b->c.btree_id];
+
+ bc->live[btree_node_pinned(b)].nr--;
+ bc->nr_freeable++;
+ list_move(&b->list, &bc->freeable);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
- if (!ret && b->c.btree_id < BTREE_ID_NR)
- bc->used_by_btree[b->c.btree_id]++;
- return ret;
+ if (ret)
+ return ret;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ bc->nr_by_btree[b->c.btree_id]++;
+
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+ list_move_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+
+ bc->nr_freeable--;
+ return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
- int ret;
-
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
- ret = __bch2_btree_node_hash_insert(bc, b);
- if (!ret)
- list_add_tail(&b->list, &bc->live);
+ int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
@@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
-
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = b->c.level
- ? bc->pinned_nodes_interior_mask
- : bc->pinned_nodes_leaf_mask;
-
- if ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_read(&bc->dirty) + nr >=
- bc->used * 3 / 4;
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
- bc->freed++;
+ bc->nr_freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- bc->not_freed_access_bit++;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+ --touched;;
} else if (!btree_node_reclaim(c, b, true)) {
+ bch2_btree_node_hash_remove(bc, b);
+
freed++;
btree_node_data_free(c, b);
- bc->freed++;
+ bc->nr_freed++;
- bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -450,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
- list_move(&bc->live, &b->list);
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@@ -464,8 +531,8 @@ restart:
break;
}
out_rotate:
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@@ -478,44 +545,45 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- unsigned i, flags;
+ struct btree *b, *t;
+ unsigned long flags;
- shrinker_free(bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
- list_add(&r->b->list, &bc->live);
+ list_add(&r->b->list, &bc->live[0].list);
}
- list_splice(&bc->freeable, &bc->live);
-
- while (!list_empty(&bc->live)) {
- b = list_first_entry(&bc->live, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
@@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
}
BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_read(&c->btree_cache.dirty));
+ atomic_long_read(&c->btree_cache.nr_dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
- while (!list_empty(&bc->freed_nonpcpu)) {
- b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
six_lock_exit(&b->c.lock);
kfree(b);
@@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ BUG_ON(bc->nr_by_btree[i]);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
+ BUG_ON(bc->nr_freeable);
+
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
@@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->reserve; i++)
+ for (i = 0; i < bc->nr_reserve; i++)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
- bc->shrink = shrink;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 4;
- shrink->private_data = c;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@@ -582,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b, false))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -716,14 +804,15 @@ got_node:
mutex_unlock(&bc->lock);
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ if (memalloc_flags_do(PF_MEMALLOC_NORECLAIM,
+ btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))) {
bch2_trans_unlock(trans);
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
goto err;
}
mutex_lock(&bc->lock);
- bc->used++;
+ bc->nr_freeable++;
got_mem:
mutex_unlock(&bc->lock);
@@ -1264,8 +1353,8 @@ wait_on_io:
BUG_ON(btree_node_dirty(b));
mutex_lock(&bc->lock);
- btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
+ btree_node_data_free(c, b);
mutex_unlock(&bc->lock);
out:
six_unlock_write(&b->c.lock);
@@ -1337,13 +1426,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
}
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, unsigned nr)
+ const char *label, size_t nr)
{
prt_printf(out, "%s\t", label);
prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%u)\n", nr);
+ prt_printf(out, " (%zu)\n", nr);
}
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ NULL
+};
+
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
@@ -1351,24 +1447,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_btree_cache_line(out, c, "total:", bc->used);
- prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
prt_newline(out);
- prt_printf(out, "freed:\t%u\n", bc->freed);
+ prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
- prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
- prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
- prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
- prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
- prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
- prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
- prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
- prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
- prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+ prt_printf(out, " %s\t%llu\n",
+ bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index f8206400..367acd21 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index eb3002c4..b5e0692f 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -549,9 +549,8 @@ reconstruct_root:
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
@@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(c, ca) {
- kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
- ca->buckets_gc = NULL;
- }
+ for_each_member_device(c, ca)
+ genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
@@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
int ret = 0;
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!buckets) {
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
break;
}
-
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = ca->mi.nbuckets;
- buckets->nbuckets_minus_first =
- buckets->nbuckets - buckets->first_bucket;
- rcu_assign_pointer(ca->buckets_gc, buckets);
}
bch_err_fn(c, ret);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 56ea9a77..cb48a947 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_btree_pos_to_text(&buf, c, b);
bch_err_ratelimited(c, "%s", buf.buf);
- if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
@@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
ret = -BCH_ERR_btree_node_read_error;
@@ -2031,7 +2029,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
do_write:
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 63d76f5c..9b01ca3d 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -18,13 +18,13 @@ struct btree_node_read_all;
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_inc(&c->btree_cache.dirty);
+ atomic_long_inc(&c->btree_cache.nr_dirty);
}
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 6d87e577..aec89e00 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -6,6 +6,8 @@
#include "btree_types.h"
#include "trace.h"
+#include <linux/sched/mm.h>
+
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
@@ -529,6 +531,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size = roundup(size, 8);
@@ -865,29 +873,33 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
(_do) ?: bch2_trans_relock(_trans); \
})
-#define allocate_dropping_locks_errcode(_trans, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- int _ret = _do; \
- \
- if (bch2_err_matches(_ret, ENOMEM)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, _do); \
- } \
- _ret; \
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
})
-#define allocate_dropping_locks(_trans, _ret, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- typeof(_do) _p = _do; \
- \
- _ret = 0; \
- if (unlikely(!_p)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
- } \
- _p; \
+#define allocate_dropping_locks_errcode(_trans, _do) \
+({ \
+ int _ret = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
+ \
+ if (bch2_err_matches(_ret, ENOMEM)) { \
+ _ret = drop_locks_do(_trans, _do); \
+ } \
+ _ret; \
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do) \
+({ \
+ typeof(_do) _p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
+ \
+ _ret = 0; \
+ if (unlikely(!_p)) { \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
+ } \
+ _p; \
})
#define bch2_trans_run(_c, _do) \
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
index 74933490..c1657182 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys)
{
sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+ cond_resched();
+
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 2e49ca71..4b2423b0 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -116,8 +116,10 @@ static void bkey_cached_free(struct btree_key_cache *bc,
this_cpu_inc(*bc->nr_pending);
}
-static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
+static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s)
{
+ gfp_t gfp = GFP_KERNEL|__GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
if (unlikely(!ck))
return NULL;
@@ -145,7 +147,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
goto lock;
ck = allocate_dropping_locks(trans, ret,
- __bkey_cached_alloc(key_u64s, _gfp));
+ __bkey_cached_alloc(key_u64s));
if (ret) {
if (ck)
kfree(ck->k);
@@ -239,7 +241,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
- kmalloc(key_u64s * sizeof(u64), _gfp));
+ kmalloc(key_u64s * sizeof(u64), GFP_KERNEL));
if (unlikely(!new_k)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 0df07929..4568a41f 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -138,6 +138,31 @@ struct btree {
struct list_head list;
};
+#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+ x(read_in_flight) \
+ x(write_in_flight) \
+ x(noevict) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(access_bit)
+
+enum bch_btree_cache_not_freed_reasons {
+#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
+};
+
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
+};
+
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -155,28 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
- /* Number of elements in live + freeable lists */
- unsigned used;
- unsigned reserve;
- unsigned freed;
- unsigned not_freed_lock_intent;
- unsigned not_freed_lock_write;
- unsigned not_freed_dirty;
- unsigned not_freed_read_in_flight;
- unsigned not_freed_write_in_flight;
- unsigned not_freed_noevict;
- unsigned not_freed_write_blocked;
- unsigned not_freed_will_make_reachable;
- unsigned not_freed_access_bit;
- atomic_t dirty;
- struct shrinker *shrink;
+ size_t nr_freeable;
+ size_t nr_reserve;
+ size_t nr_by_btree[BTREE_ID_NR];
+ atomic_long_t nr_dirty;
- unsigned used_by_btree[BTREE_ID_NR];
+ /* shrinker stats */
+ size_t nr_freed;
+ u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@@ -189,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
- u64 pinned_nodes_leaf_mask;
- u64 pinned_nodes_interior_mask;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@@ -582,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
- x(never_write)
+ x(never_write) \
+ x(pinned)
enum btree_flags {
/* First bits for btree node write type */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 1433aefb..190bc1e8 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "clock.h"
#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
@@ -145,7 +146,7 @@ fsck_err:
printbuf_exit(&buf);
return ret;
topology_repair:
- if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
bch2_inconsistent_error(c);
ret = -BCH_ERR_btree_need_topology_repair;
@@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
__btree_node_free(trans, b);
+
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
@@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index a2274429..aef58043 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
struct bch_dev_usage *usage)
{
+ if (out->nr_tabstops < 5) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ }
+
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
@@ -100,12 +109,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (!ca) {
- if (fsck_err(trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
+ trans, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
*do_update = true;
return 0;
}
@@ -476,7 +486,7 @@ out:
return ret;
err:
bch2_dump_trans_updates(trans);
- ret = -EIO;
+ ret = -BCH_ERR_bucket_ref_update;
goto out;
}
@@ -562,8 +572,8 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
- if (insert)
- ret = -EIO;
+ if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
+ ret = -BCH_ERR_trigger_pointer;
goto err;
}
@@ -592,7 +602,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_trigger_pointer;
goto err_unlock;
}
@@ -637,7 +647,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -EIO;
+ ret = -BCH_ERR_trigger_stripe_pointer;
goto err;
}
@@ -676,7 +686,7 @@ err:
(u64) p.ec.idx, buf.buf);
printbuf_exit(&buf);
bch2_inconsistent_error(c);
- return -EIO;
+ return -BCH_ERR_trigger_stripe_pointer;
}
m->block_sectors[p.ec.block] += sectors;
@@ -740,7 +750,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
} else if (!p.has_ec) {
*replicas_sectors += disk_sectors;
- acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@@ -876,7 +886,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
need_rebalance_delta -= s != 0;
need_rebalance_sectors_delta -= s;
- s = bch2_bkey_sectors_need_rebalance(c, old);
+ s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
need_rebalance_sectors_delta += s;
@@ -956,7 +966,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
- ret = -EIO;
+ ret = -BCH_ERR_metadata_bucket_inconsistency;
goto err;
}
@@ -1012,7 +1022,7 @@ err:
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
- return -EIO;
+ return -BCH_ERR_metadata_bucket_inconsistency;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index edbdffd5..e2cb7b24 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
-static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->buckets_gc,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->state_lock) ||
- lockdep_is_held(&ca->bucket_lock));
-}
-
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- struct bucket_array *buckets = gc_bucket_array(ca);
-
- if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
- return NULL;
- return buckets->b + b;
+ return genradix_ptr(&ca->buckets_gc, b);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index a19460a1..28bd09a2 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -19,14 +19,6 @@ struct bucket {
u32 stripe_sectors;
} __aligned(sizeof(long));
-struct bucket_array {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- struct bucket b[] __counted_by(nbuckets);
-};
-
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index e7208bf1..ce8fc677 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
- ret = crypto_skcipher_encrypt(req);
+ int ret = crypto_skcipher_encrypt(req);
if (ret)
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
@@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
void *buf, size_t len)
{
if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg,
- is_vmalloc_addr(buf)
- ? vmalloc_to_page(buf)
- : virt_to_page(buf),
- len, offset_in_page(buf));
+ struct scatterlist sg = {};
+
+ sg_mark_end(&sg);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
return do_encrypt_sg(tfm, nonce, &sg, len);
} else {
- unsigned pages = buf_pages(buf, len);
- struct scatterlist *sg;
- size_t orig_len = len;
- int ret, i;
-
- sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return -BCH_ERR_ENOMEM_do_encrypt;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret;
- sg_init_table(sg, pages);
+ darray_init(&sgl);
- for (i = 0; i < pages; i++) {
+ while (len) {
unsigned offset = offset_in_page(buf);
- unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) vmalloc_to_page(buf),
+ .offset = offset,
+ .length = min(len, PAGE_SIZE - offset),
+ };
- sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
- buf += pg_len;
- len -= pg_len;
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+ BUG_ON(darray_push(&sgl, sg));
+ }
+
+ buf += sg.length;
+ len -= sg.length;
+ sgl_len += sg.length;
}
- ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
- kfree(sg);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
}
@@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
int ret = 0;
if (!bch2_csum_type_is_encryption(type))
return 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
+ darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
-
- ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) bv.bv_page,
+ .offset = bv.bv_offset,
+ .length = bv.bv_len,
+ };
+
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
if (ret)
- return ret;
+ goto err;
- nonce = nonce_add(nonce, bytes);
- bytes = 0;
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
+ BUG_ON(darray_push(&sgl, sg));
}
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
- }
-
- if (sg != sgl) {
- sg_mark_end(sg - 1);
- return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ sgl_len += sg.length;
}
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 65176d51..757b9884 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -337,6 +337,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
+ ret = -EIO;
goto out;
}
@@ -570,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 141a4c63..6d8d5e6f 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -18,6 +18,7 @@
#include "ec.h"
#include "error.h"
#include "io_read.h"
+#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
@@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
bch2_prt_csum_type(out, s.csum_type);
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+ if (s.disk_label) {
+ prt_str(out, " label");
+ bch2_disk_path_to_text(out, c, s.disk_label - 1);
+ }
+
for (unsigned i = 0; i < s.nr_blocks; i++) {
const struct bch_extent_ptr *ptr = sp->ptrs + i;
if ((void *) ptr >= bkey_val_end(k))
break;
+ prt_char(out, ' ');
bch2_extent_ptr_to_text(out, c, ptr);
if (s.csum_type < BCH_CSUM_NR &&
@@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
} else {
@@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
}
@@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -EIO;
+ if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err_unlock;
}
@@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
+static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) /* fill struct stripe *m from the bch_stripe key value *s */
+{
+ m->sectors = le16_to_cpu(s->sectors); /* stored little-endian in the key; converted to CPU byte order */
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->disk_label = s->disk_label;
+ m->blocks_nonempty = 0; /* recomputed from scratch by the loop below */
+
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i); /* !! maps any nonzero block count to 1, so this counts nonempty blocks */
+}
+
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
memset(m, 0, sizeof(*m));
} else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+ stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
@@ -816,13 +829,15 @@ err:
}
/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf;
+ struct ec_stripe_buf *buf = NULL;
struct closure cl;
struct bch_stripe *v;
unsigned i, offset;
+ const char *msg = NULL;
int ret = 0;
closure_init_stack(&cl);
@@ -835,32 +850,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: error %i looking up stripe", ret);
- kfree(buf);
- return -EIO;
+ msg = "stripe not found";
+ goto err;
}
v = &bkey_i_to_stripe(&buf->key)->v;
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: pointer doesn't match stripe");
- ret = -EIO;
+ msg = "pointer doesn't match stripe";
goto err;
}
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: read is bigger than stripe");
- ret = -EIO;
+ msg = "read is bigger than stripe";
goto err;
}
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
- if (ret)
+ if (ret) {
+ msg = "-ENOMEM";
goto err;
+ }
for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
@@ -868,9 +879,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- ret = -EIO;
+ msg = "unable to read enough blocks";
goto err;
}
@@ -882,20 +891,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-err:
+out:
ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
+err:
+ struct printbuf msgbuf = PRINTBUF;
+ bch2_bkey_val_to_text(&msgbuf, c, orig_k);
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
+ printbuf_exit(&msgbuf);;
+ ret = -BCH_ERR_stripe_reconstruct;
+ goto out;
}
/* stripe bucket accounting: */
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx)
{
ec_stripes_heap n, *h = &c->ec_stripes_heap;
if (idx >= h->size) {
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
mutex_lock(&c->ec_stripes_heap_lock);
@@ -909,11 +926,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
free_heap(&n);
}
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
+ if (!genradix_ptr_alloc(&c->stripes, idx, GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
if (c->gc_pos.phase != GC_PHASE_not_running &&
- !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
+ !genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
return 0;
@@ -923,7 +940,7 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
struct btree_iter *iter)
{
return allocate_dropping_locks_errcode(trans,
- __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
+ __ec_stripe_mem_alloc(trans->c, iter->pos.offset));
}
/*
@@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
@@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
-static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s = h->s;
+ lockdep_assert_held(&h->lock);
+
BUG_ON(!s->allocated && !s->err);
h->s = NULL;
@@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_new_put(c, s, STRIPE_REF_io);
}
+static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) /* record err on h's in-progress stripe, then detach it from the head */
+{
+ h->s->err = err; /* h->s must be non-NULL here; ec_stripe_new_set_pending() does BUG_ON(!s->allocated && !s->err), so err should be nonzero for an unallocated stripe */
+ ec_stripe_new_set_pending(c, h); /* asserts h->lock is held (lockdep) and sets h->s = NULL */
+}
+
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
struct ec_stripe_new *s = ob->ec;
@@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
struct bkey_i *k,
unsigned nr_data,
unsigned nr_parity,
- unsigned stripe_size)
+ unsigned stripe_size,
+ unsigned disk_label)
{
struct bkey_i_stripe *s = bkey_stripe_init(k);
unsigned u64s;
@@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
s->v.csum_type = BCH_CSUM_crc32c;
- s->v.pad = 0;
+ s->v.disk_label = disk_label;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
BUG_ON(1 << s->v.csum_granularity_bits >=
@@ -1685,40 +1711,30 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->nr_parity = h->redundancy;
ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity, h->blocksize);
+ s->nr_data, s->nr_parity,
+ h->blocksize, h->disk_label);
h->s = s;
+ h->nr_created++;
return 0;
}
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
+static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->target = target;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
-
rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, target);
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ unsigned nr_devs = dev_mask_nr(&h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
+ unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
h->blocksize = pick_blocksize(c, &h->devs);
+ h->nr_active_devs = 0;
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1729,9 +1745,47 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
- if (h->nr_active_devs < h->redundancy + 2)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
- h->nr_active_devs, h->redundancy + 2);
+ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
+
+ if (h->insufficient_devs) {
+ const char *err;
+
+ if (nr_devs < h->redundancy + 2)
+ err = NULL;
+ else if (nr_devs_with_durability < h->redundancy + 2)
+ err = "cannot use durability=0 devices";
+ else
+ err = "mismatched bucket sizes";
+
+ if (err)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
+ h->nr_active_devs, h->redundancy + 2, err);
+ }
+
+ if (h->s && !h->s->allocated)
+ ec_stripe_new_cancel(c, h, -EINTR);
+
+ h->rw_devs_change_count = c->rw_devs_change_count;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->disk_label = disk_label;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
list_add(&h->list, &c->ec_stripe_head_list);
return h;
@@ -1743,14 +1797,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data) == h->s->nr_data)
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_set_pending(c, h);
mutex_unlock(&h->lock);
}
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ unsigned disk_label,
unsigned algo,
unsigned redundancy,
enum bch_watermark watermark)
@@ -1768,27 +1822,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto found;
+ goto err;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->target == target &&
+ if (h->disk_label == disk_label &&
h->algo == algo &&
h->redundancy == redundancy &&
h->watermark == watermark) {
ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret)
+ if (ret) {
h = ERR_PTR(ret);
+ goto err;
+ }
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+ h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
found:
- if (!IS_ERR_OR_NULL(h) &&
- h->nr_active_devs < h->redundancy + 2) {
+ if (h->rw_devs_change_count != c->rw_devs_change_count)
+ ec_stripe_head_devs_update(c, h);
+
+ if (h->insufficient_devs) {
mutex_unlock(&h->lock);
h = NULL;
}
+err:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1796,38 +1855,39 @@ found:
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
enum bch_watermark watermark, struct closure *cl)
{
+ struct ec_stripe_new *s = h->s;
struct bch_fs *c = trans->c;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
- struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
int ret = 0;
- BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
- BUG_ON(v->nr_redundant != h->s->nr_parity);
+ BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
+ BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
- for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
__clear_bit(v->ptrs[i].dev, devs.d);
- if (i < h->s->nr_data)
+ if (i < s->nr_data)
nr_have_data++;
else
nr_have_parity++;
}
- BUG_ON(nr_have_data > h->s->nr_data);
- BUG_ON(nr_have_parity > h->s->nr_parity);
+ BUG_ON(nr_have_data > s->nr_data);
+ BUG_ON(nr_have_parity > s->nr_parity);
buckets.nr = 0;
- if (nr_have_parity < h->s->nr_parity) {
+ if (nr_have_parity < s->nr_parity) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->parity_stripe,
&devs,
- h->s->nr_parity,
+ s->nr_parity,
&nr_have_parity,
&have_cache, 0,
BCH_DATA_parity,
@@ -1835,14 +1895,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data + h->s->nr_parity,
- h->s->nr_data);
- BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data + s->nr_parity,
+ s->nr_data);
+ BUG_ON(j >= s->nr_data + s->nr_parity);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -1850,11 +1910,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
}
buckets.nr = 0;
- if (nr_have_data < h->s->nr_data) {
+ if (nr_have_data < s->nr_data) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->block_stripe,
&devs,
- h->s->nr_data,
+ s->nr_data,
&nr_have_data,
&have_cache, 0,
BCH_DATA_user,
@@ -1862,13 +1922,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data, 0);
- BUG_ON(j >= h->s->nr_data);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data, 0);
+ BUG_ON(j >= s->nr_data);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -1878,7 +1938,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
return 0;
}
-/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
{
@@ -1901,7 +1960,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
m = genradix_ptr(&c->stripes, stripe_idx);
- if (m->algorithm == head->algo &&
+ if (m->disk_label == head->disk_label &&
+ m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
@@ -1914,72 +1974,75 @@ static s64 get_existing_stripe(struct bch_fs *c,
return ret;
}
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
{
- struct bch_fs *c = trans->c;
- struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
- struct bch_stripe *existing_v;
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
unsigned i;
- s64 idx;
- int ret;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
- ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, h->s);
- return ret;
- }
-
- existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
-
- BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
- h->s->nr_data = existing_v->nr_blocks -
+ BUG_ON(existing_v->nr_redundant != s->nr_parity);
+ s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
- bch2_stripe_close(c, h->s);
+ bch2_stripe_close(c, s);
return ret;
}
- BUG_ON(h->s->existing_stripe.size != h->blocksize);
- BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
/*
* Free buckets we initially allocated - they might conflict with
* blocks from the stripe we're reusing:
*/
- for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
- h->s->blocks[i] = 0;
+ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
+ s->blocks[i] = 0;
}
- memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
- memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
+ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
for (i = 0; i < existing_v->nr_blocks; i++) {
if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, h->s->blocks_gotten);
- __set_bit(i, h->s->blocks_allocated);
+ __set_bit(i, s->blocks_gotten);
+ __set_bit(i, s->blocks_allocated);
}
- ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
}
- bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
- h->s->have_existing_stripe = true;
+ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
+ s->have_existing_stripe = true;
return 0;
}
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) /* pick an existing stripe matching h and set up h->s to reuse it; returns 0 or a negative error */
+{
+ struct bch_fs *c = trans->c;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0) /* negative return means no candidate stripe was found */
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); /* reads the stripe key for idx into h->s->existing_stripe */
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, /* any error other than a transaction restart is reported as fatal */
+ "reading stripe key: %s", bch2_err_str(ret));
+ if (ret) {
+ bch2_stripe_close(c, h->s); /* NOTE(review): presumably releases the stripe opened by get_existing_stripe() - confirm */
+ return ret;
+ }
+
+ return init_new_stripe_from_existing(c, h->s); /* remaining setup depends only on the stripe_new, not on the head */
+}
+
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
struct bch_fs *c = trans->c;
@@ -2046,9 +2109,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
bool waiting = false;
+ unsigned disk_label = 0;
+ struct target t = target_decode(target);
int ret;
- h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (t.type == TARGET_GROUP) {
+ if (t.group > U8_MAX) {
+ bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
+ return NULL;
+ }
+ disk_label = t.group + 1; /* 0 == no label */
+ }
+
+ h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
if (IS_ERR_OR_NULL(h))
return h;
@@ -2126,6 +2199,79 @@ err:
return ERR_PTR(ret);
}
+/* device removal */
+
+static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) /* k_a: a key from BTREE_ID_alloc (see bch2_dev_remove_stripes()); returns 0 or a negative error */
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe) /* a->stripe is used below as the stripes-btree offset; zero means there is nothing to update */
+ return 0;
+
+ if (a->stripe_sectors) { /* refuse while the bucket still accounts for stripe sectors */
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_i_stripe *s = /* mutable copy of the stripe key at POS(0, a->stripe) */
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ s64 sectors = 0;
+ for (unsigned i = 0; i < s->v.nr_blocks; i++)
+ sectors -= stripe_blockcount_get(&s->v, i); /* negative sum of all per-block counts */
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); /* replicas entry built from the key before any pointer is changed */
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); /* apply the negative delta to the old entry */
+ if (ret)
+ goto err;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == k_a.k->p.inode) { /* the alloc key's inode field is the device index (caller iterates POS(ca->dev_idx, ...)) */
+ if (stripe_blockcount_get(&s->v, ptr - &ptrs.start->ptr)) { /* pointer difference is this pointer's index, used as the block index */
+ bch_err(trans->c, "trying to invalidate device in stripe when stripe block not empty");
+ ret = -BCH_ERR_invalidate_stripe_to_dev;
+ goto err;
+ }
+ ptr->dev = BCH_SB_MEMBER_INVALID; /* keep the pointer slot, but mark its device invalid */
+ }
+
+ sectors = -sectors; /* same magnitude, now positive */
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); /* rebuilt after ptr->dev was rewritten above */
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); /* apply the positive delta to the new entry */
+ if (ret)
+ goto err; /* control reaches err: on success as well; the label is the shared cleanup */
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *c, struct bch_dev *ca) /* run bch2_invalidate_stripe_to_dev() on every alloc key belonging to device ca */
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter, /* NOTE(review): going by the macro name this commits as it iterates - confirm against its definition */
+ BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), /* key range limited to positions whose inode is ca->dev_idx */
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ bch2_invalidate_stripe_to_dev(trans, k); /* statement-expression value is this call's return code */
+ })));
+}
+
+/* startup/shutdown */
+
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
@@ -2151,8 +2297,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
}
goto unlock;
found:
- h->s->err = -BCH_ERR_erofs_no_writes;
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
mutex_unlock(&h->lock);
}
@@ -2193,21 +2338,13 @@ int bch2_stripes_read(struct bch_fs *c)
if (k.k->type != KEY_TYPE_stripe)
continue;
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset);
if (ret)
break;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
@@ -2252,6 +2389,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
prt_printf(out, " %u", s->blocks[i]);
prt_newline(out);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
+ prt_newline(out);
}
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2261,9 +2400,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "target %u algo %u redundancy %u %s:\n",
- h->target, h->algo, h->redundancy,
- bch2_watermarks[h->watermark]);
+ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
+ h->disk_label, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark],
+ h->nr_created);
if (h->s)
bch2_new_stripe_to_text(out, c, h->s);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 90962b3c..a2e2d79d 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -97,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
{
- return data_ptr->dev == stripe_ptr->dev &&
+ return (data_ptr->dev == stripe_ptr->dev ||
+ data_ptr->dev == BCH_SB_MEMBER_INVALID ||
+ stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
data_ptr->gen == stripe_ptr->gen &&
data_ptr->offset >= stripe_ptr->offset &&
data_ptr->offset < stripe_ptr->offset + sectors;
@@ -186,10 +188,15 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
- unsigned target;
+ unsigned disk_label;
unsigned algo;
unsigned redundancy;
enum bch_watermark watermark;
+ bool insufficient_devs;
+
+ unsigned long rw_devs_change_count;
+
+ u64 nr_created;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -202,7 +209,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
@@ -247,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
+int bch2_dev_remove_stripes(struct bch_fs *, struct bch_dev *);
+
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);
diff --git a/libbcachefs/ec_format.h b/libbcachefs/ec_format.h
index 44ce88ba..64ef52e0 100644
--- a/libbcachefs/ec_format.h
+++ b/libbcachefs/ec_format.h
@@ -11,7 +11,14 @@ struct bch_stripe {
__u8 csum_granularity_bits;
__u8 csum_type;
- __u8 pad;
+
+ /*
+ * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
+ *
+ * we can manage with this because this only needs to point to a
+ * disk label, not a target:
+ */
+ __u8 disk_label;
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 1df03dcc..8d1e70e8 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -16,6 +16,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 blocks_nonempty;
+ u8 disk_label;
};
struct gc_stripe {
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index ab5a7ade..60b7875a 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -119,8 +119,8 @@
x(EEXIST, EEXIST_str_hash_set) \
x(EEXIST, EEXIST_discard_in_flight_add) \
x(EEXIST, EEXIST_subvolume_create) \
- x(0, open_buckets_empty) \
- x(0, freelist_empty) \
+ x(ENOSPC, open_buckets_empty) \
+ x(ENOSPC, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
x(0, transaction_restart) \
x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@@ -244,6 +244,16 @@
x(EIO, btree_node_read_error) \
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
+ x(EIO, bucket_ref_update) \
+ x(EIO, trigger_pointer) \
+ x(EIO, trigger_stripe_pointer) \
+ x(EIO, metadata_bucket_inconsistency) \
+ x(EIO, mark_stripe) \
+ x(EIO, stripe_reconstruct) \
+ x(EIO, key_type_error) \
+ x(EIO, no_device_to_read_from) \
+ x(EIO, missing_indirect_extent) \
+ x(EIO, invalidate_stripe_to_dev) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -257,7 +267,6 @@
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, need_inode_lock) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs)
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index e317df36..5467d0f9 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (k.k->type == KEY_TYPE_error)
- return -EIO;
+ return -BCH_ERR_key_type_error;
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
* read:
*/
if (!ret && !p.ptr.cached)
- ret = -EIO;
+ ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
@@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
- if (!p.idx && !ca)
+ if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
- if (!p.idx && !bch2_dev_is_readable(ca))
- p.idx++;
-
- if (p.idx >= (unsigned) p.has_ec + 1)
+ if (p.idx > (unsigned) p.has_ec)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@@ -781,14 +778,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
- union bch_extent_entry *ret = entry;
bool drop_crc = true;
+ if (k.k->type == KEY_TYPE_stripe) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
@@ -811,21 +811,28 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
break;
if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry)) {
- ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_is_stripe_ptr(entry))
extent_entry_drop(k, entry);
- }
}
-
- return ret;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
+ if (k.k->type != KEY_TYPE_stripe) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == ptr->dev && p.has_ec) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+ }
+
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
- union bch_extent_entry *ret =
- bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@@ -837,14 +844,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
!bch2_bkey_dirty_devs(k.s_c).nr) {
k.k->type = KEY_TYPE_error;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
k.k->type = KEY_TYPE_deleted;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
}
-
- return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
@@ -854,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
-
- if (ptr)
- bch2_bkey_drop_ptr_noerror(k, ptr);
+ bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
@@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
if (p1.ptr.dev == p2.ptr.dev &&
p1.ptr.gen == p2.ptr.gen &&
+
+ /*
+ * This checks that the two pointers point
+ * to the same region on disk - adjusting
+ * for the difference in where the extents
+ * start, since one may have been trimmed:
+ */
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
+
+ /*
+ * This additionally checks that the
+ * extents overlap on disk, since the
+ * previous check may trigger spuriously
+ * when one extent is immediately partially
+ * overwritten with another extent (so that
+ * on disk they are adjacent) and
+ * compression is in use:
+ */
+ ((p1.ptr.offset >= p2.ptr.offset &&
+ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
+ (p2.ptr.offset >= p1.ptr.offset &&
+ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
return true;
return false;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 8e1ba46f..ed5001dd 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
@@ -649,26 +646,38 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
- struct bch_extent_ptr *);
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+/*
+ * bch2_bkey_drop_ptrs_noerror() and bch2_bkey_drop_ptrs() drop every
+ * pointer of @_k for which @_cond - an expression that may refer to
+ * @_ptr - is true.
+ *
+ * After each drop the scan jumps back to _again, re-reads the pointer
+ * range with bch2_bkey_ptrs() and starts over, so iteration never
+ * continues over a key that has just been modified. _again is declared
+ * with __label__, which keeps the label local to each expansion.
+ *
+ * The two macros differ only in the helper they call:
+ * bch2_bkey_drop_ptr_noerror() versus bch2_bkey_drop_ptr().
+ */
+#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
do { \
- struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
\
- struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \
- \
- while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
+ bkey_for_each_ptr(_ptrs, _ptr) \
if (_cond) { \
- _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
- _ptrs = bch2_bkey_ptrs(_k); \
- continue; \
+ bch2_bkey_drop_ptr_noerror(_k, _ptr); \
+ goto _again; \
} \
+} while (0)
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
\
- (_ptr)++; \
- } \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr(_k, _ptr); \
+ goto _again; \
+ } \
} while (0)
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 508d029a..7e10a9dd 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
if (ret)
goto err;
@@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- STR_HASH_must_create);
+ STR_HASH_must_create|BTREE_ITER_with_updates);
if (ret)
goto err;
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index f5cff824..99fef934 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -791,8 +791,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
- loff_t pos, unsigned len,
- bool inode_locked)
+ loff_t pos, unsigned len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@@ -816,15 +815,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
- /*
- * If we're not using the inode lock, we need to lock all the folios for
- * atomiticity of writes vs. other writes:
- */
- if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
- ret = -BCH_ERR_need_inode_lock;
- goto out;
- }
-
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@@ -921,10 +911,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size) {
- BUG_ON(!inode_locked);
+ if (end > inode->v.i_size)
i_size_write(&inode->v, end);
- }
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@@ -968,68 +956,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos;
- bool inode_locked = false;
- ssize_t written = 0, written2 = 0, ret = 0;
-
- /*
- * We don't take the inode lock unless i_size will be changing. Folio
- * locks provide exclusion with other writes, and the pagecache add lock
- * provides exclusion with truncate and hole punching.
- *
- * There is one nasty corner case where atomicity would be broken
- * without great care: when copying data from userspace to the page
- * cache, we do that with faults disable - a page fault would recurse
- * back into the filesystem, taking filesystem locks again, and
- * deadlock; so it's done with faults disabled, and we fault in the user
- * buffer when we aren't holding locks.
- *
- * If we do part of the write, but we then race and in the userspace
- * buffer have been evicted and are no longer resident, then we have to
- * drop our folio locks to re-fault them in, breaking write atomicity.
- *
- * To fix this, we restart the write from the start, if we weren't
- * holding the inode lock.
- *
- * There is another wrinkle after that; if we restart the write from the
- * start, and then get an unrecoverable error, we _cannot_ claim to
- * userspace that we did not write data we actually did - so we must
- * track (written2) the most we ever wrote.
- */
-
- if ((iocb->ki_flags & IOCB_APPEND) ||
- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
- inode_lock(&inode->v);
- inode_locked = true;
- }
-
- ret = generic_write_checks(iocb, iter);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
- if (ret) {
- if (!inode_locked) {
- inode_lock(&inode->v);
- inode_locked = true;
- ret = file_remove_privs_flags(file, 0);
- }
- if (ret)
- goto unlock;
- }
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- pos = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
bch2_pagecache_add_get(inode);
- if (!inode_locked &&
- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
- goto get_inode_lock;
-
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@@ -1054,17 +986,12 @@ again:
}
}
- if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
- goto get_inode_lock;
-
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
- if (ret == -BCH_ERR_need_inode_lock)
- goto get_inode_lock;
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;
@@ -1085,46 +1012,50 @@ again:
}
pos += ret;
written += ret;
- written2 = max(written, written2);
-
- if (ret != bytes && !inode_locked)
- goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
-
- if (0) {
-get_inode_lock:
- bch2_pagecache_add_put(inode);
- inode_lock(&inode->v);
- inode_locked = true;
- bch2_pagecache_add_get(inode);
-
- iov_iter_revert(iter, written);
- pos -= written;
- written = 0;
- ret = 0;
- }
} while (iov_iter_count(iter));
- bch2_pagecache_add_put(inode);
-unlock:
- if (inode_locked)
- inode_unlock(&inode->v);
- iocb->ki_pos += written;
+ bch2_pagecache_add_put(inode);
- ret = max(written, written2) ?: ret;
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
- return ret;
+ return written ? written : ret;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- ssize_t ret = iocb->ki_flags & IOCB_DIRECT
- ? bch2_direct_write(iocb, iter)
- : bch2_buffered_write(iocb, iter);
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ ssize_t ret;
+ /*
+ * IOCB_DIRECT writes go straight to bch2_direct_write(); this function
+ * does not take the inode lock or run the generic checks for them.
+ */
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
+ /*
+ * Buffered path: the inode lock is held across the generic write
+ * checks, privilege removal, timestamp update and the write itself.
+ */
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ ret = bch2_buffered_write(iocb, from);
+ if (likely(ret > 0))
+ iocb->ki_pos += ret; /* advance the file position by the number of bytes written */
+unlock:
+ inode_unlock(&inode->v);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret); /* runs after the inode lock has been dropped */
+out:
+ return bch2_err_class(ret);
}
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 7a9c164c..12c1873f 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -273,14 +273,6 @@ retry:
}
}
-#define memalloc_flags_do(_flags, _do) \
-({ \
- unsigned _saved_flags = memalloc_flags_save(_flags); \
- typeof(_do) _ret = _do; \
- memalloc_noreclaim_restore(_saved_flags); \
- _ret; \
-})
-
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
BUG();
@@ -380,6 +372,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
/*
@@ -406,13 +400,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
+ kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+ kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), current_fsuid()),
- from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ from_kuid(i_user_ns(&dir->v), kuid),
+ from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -727,15 +723,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
if (mode == BCH_RENAME_OVERWRITE) {
@@ -776,18 +773,48 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
+retry:
+ bch2_trans_begin(trans);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode));
+ ret = bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode);
if (unlikely(ret))
+ goto err_tx_restart;
+
+ if (whiteout) {
+ whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+ ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ bch2_inode_init_early(c, whiteout_inode_u);
+
+ ret = bch2_create_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ whiteout_inode_u,
+ &src_dentry->d_name,
+ from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+ S_IFCHR|WHITEOUT_MODE, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+ bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (unlikely(ret)) {
+err_tx_restart:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err;
+ }
BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
@@ -835,11 +862,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
+ kuid_t kuid;
+ kgid_t kgid;
- if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
- if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+ }
+ if (ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+ }
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@@ -854,11 +887,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
- ? attr->ia_gid
+ ? kgid
: inode->v.i_gid;
- if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ if (!in_group_or_capable(idmap, &inode->v,
+ make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@@ -874,17 +907,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
- if (attr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (attr->ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
+ }
- if (attr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (attr->ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
+ }
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -940,13 +979,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
- stat->uid = inode->v.i_uid;
- stat->gid = inode->v.i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@@ -1865,30 +1906,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+/*
+ * Write to @seq the options that carry OPT_MOUNT, do not carry OPT_HIDDEN
+ * and differ from their defaults, each one preceded by ','.
+ *
+ * Returns -ENOMEM if the printbuf recorded an allocation failure, else 0.
+ */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
- if ((opt->flags & OPT_HIDDEN) ||
- !(opt->flags & OPT_MOUNT))
- continue;
+ bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+ OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+ /*
+ * bch2_opts_to_text() only puts ',' between entries, whereas the loop
+ * it replaces emitted ',' in front of every option. Restore the
+ * leading separator so the first option does not run into whatever
+ * @seq already holds.
+ *
+ * Also skip the output when there is nothing to print: buf.buf is
+ * presumably still unallocated if no option was appended or the
+ * allocation failed - verify against the printbuf implementation.
+ */
+ if (buf.buf && buf.buf[0]) {
+ seq_putc(seq, ',');
+ seq_puts(seq, buf.buf);
+ }
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- printbuf_reset(&buf);
- bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
- OPT_SHOW_MOUNT_STYLE);
- seq_putc(seq, ',');
- seq_puts(seq, buf.buf);
- }
-
- if (buf.allocation_failure)
- ret = -ENOMEM;
+ int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@@ -2209,7 +2233,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index ce27ba1f..b2f50e74 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
orig_k->k->k.size,
reflink_offset);
bch2_inconsistent_error(trans->c);
- ret = -EIO;
+ ret = -BCH_ERR_missing_indirect_extent;
goto err;
}
@@ -869,9 +869,15 @@ retry_pick:
goto hole;
if (pick_ret < 0) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
bch_err_inum_offset_ratelimited(c,
read_pos.inode, read_pos.offset << 9,
- "no device to read from");
+ "no device to read from: %s\n %s",
+ bch2_err_str(pick_ret),
+ buf.buf);
+ printbuf_exit(&buf);
goto err;
}
@@ -1086,7 +1092,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 1d4761d1..d3b5be7f 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -1447,9 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
+ &op->cl, &wp));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 32b886fe..30460bce 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
+ .e.nr_devs = 0,
.e.nr_required = 1,
};
@@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;
darray_for_each(i->ptrs, ptr)
- replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 70b998d9..ace291f1 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
- if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- atomic_read(&c->btree_cache.dirty),
- c->btree_cache.used,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 0770aebe..232be8a4 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -432,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out,
else
prt_str(out, opt->choices[v]);
break;
+ case BCH_OPT_BITFIELD:
+ prt_bitflags(out, opt->choices, v);
+ break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@@ -440,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}
+void bch2_opts_to_text(struct printbuf *out,
+ struct bch_opts opts,
+ struct bch_fs *c, struct bch_sb *sb,
+ unsigned show_mask, unsigned hide_mask,
+ unsigned flags)
+{
+ bool first = true;
+ /*
+ * Walk the whole option table and append to @out, as a comma-separated
+ * list, every option of @opts whose value differs from the one in
+ * bch2_opts_default. An option is listed only if its flags intersect
+ * @show_mask and do not intersect @hide_mask. @c, @sb and @flags are
+ * handed unchanged to bch2_opt_to_text(). No separator is written
+ * before the first entry or after the last one.
+ */
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+
+ if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
+ continue;
+
+ u64 v = bch2_opt_get_by_id(&opts, i);
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue; /* value equals the default: not listed */
+
+ if (!first)
+ prt_char(out, ','); /* separator goes between entries only */
+ first = false;
+
+ bch2_opt_to_text(out, c, sb, opt, v, flags);
+ }
+}
+
int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 3d83bcdc..cb2e244a 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -373,6 +373,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_passes, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to run explicitly") \
+ x(recovery_passes_exclude, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to exclude") \
x(recovery_pass_last, u8, \
OPT_FS|OPT_MOUNT, \
OPT_STR_NOLIMIT(bch2_recovery_passes), \
@@ -595,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
+void bch2_opts_to_text(struct printbuf *,
+ struct bch_opts,
+ struct bch_fs *, struct bch_sb *,
+ unsigned, unsigned, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
diff --git a/libbcachefs/rcu_pending.c b/libbcachefs/rcu_pending.c
index 19a64660..40a20192 100644
--- a/libbcachefs/rcu_pending.c
+++ b/libbcachefs/rcu_pending.c
@@ -219,9 +219,9 @@ static noinline void __process_finished_items(struct rcu_pending *pending,
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
- kvfree(ptr);
-
bool free_head = ((unsigned long) obj->func) & 1UL;
+
+ kvfree(ptr);
if (free_head)
kfree(obj);
}
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index cf81e512..2d299a37 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
+#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
/*
@@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 36de1c6f..be1e7ca4 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
@@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
- } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}
ret = 0;
@@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c
index 73339a0a..735b8adc 100644
--- a/libbcachefs/recovery_passes.c
+++ b/libbcachefs/recovery_passes.c
@@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v)
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass)
{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return 0;
bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
- c->recovery_passes_explicit |= BIT_ULL(pass);
+ c->opts.recovery_passes |= BIT_ULL(pass);
if (c->curr_recovery_pass >= pass) {
c->curr_recovery_pass = pass;
@@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
+ return false;
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
return true;
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 12b1d28b..5ceda186 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
- if (!bch2_member_exists(sb, r->devs[i])) {
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@@ -122,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
- r->devs[r->nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@@ -139,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
- r->devs[r->nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@@ -180,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
- e->devs[e->nr_devs++] = *i;
+ replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}
@@ -795,11 +796,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
- nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();
- if (nr_failed == e->nr_devs)
+ if (nr_online + nr_failed == e->nr_devs)
continue;
if (nr_online < e->nr_required)
diff --git a/libbcachefs/replicas_format.h b/libbcachefs/replicas_format.h
index b9720819..b7eff904 100644
--- a/libbcachefs/replicas_format.h
+++ b/libbcachefs/replicas_format.h
@@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas_v0 {
@@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas {
@@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+#define replicas_entry_add_dev(e, d) ({ /* append device d to entry e; e is evaluated more than once, so pass a side-effect-free expression */ \
+ (e)->nr_devs++; /* the count is raised before the store - presumably so the write stays inside the __counted_by(nr_devs) bound on devs[]; confirm */ \
+ (e)->devs[(e)->nr_devs - 1] = (d); \
+})
+
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 31760201..f0c14702 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -288,10 +288,10 @@ enum bch_fsck_flags {
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
- x(accounting_key_junk_at_end, 277, 0) \
- x(accounting_key_replicas_nr_devs_0, 278, 0) \
- x(accounting_key_replicas_nr_required_bad, 279, 0) \
- x(accounting_key_replicas_devs_unsorted, 280, 0) \
+ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
index b4ea6490..02bcde3c 100644
--- a/libbcachefs/sb-members.c
+++ b/libbcachefs/sb-members.c
@@ -11,7 +11,8 @@
void bch2_dev_missing(struct bch_fs *c, unsigned dev)
{
- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+ if (dev != BCH_SB_MEMBER_INVALID) /* BCH_SB_MEMBER_INVALID is not reported as an inconsistency */
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}
void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
@@ -473,3 +474,51 @@ unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
nr += bch2_member_exists((struct bch_sb *) sb, i);
return nr;
}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index f307f285..762083b5 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
lockdep_is_held(&c->state_lock));
}
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
{
return c && dev < c->sb.nr_devices
? rcu_dereference(c->devs[dev])
: NULL;
}
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
rcu_read_unlock();
return ca;
}
-void bch2_dev_missing(struct bch_fs *, unsigned);
-
static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
+ if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
@@ -354,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+int bch2_sb_member_alloc(struct bch_fs *);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/libbcachefs/sb-members_format.h b/libbcachefs/sb-members_format.h
index e2630548..d727d2df 100644
--- a/libbcachefs/sb-members_format.h
+++ b/libbcachefs/sb-members_format.h
@@ -8,6 +8,11 @@
*/
#define BCH_SB_MEMBERS_MAX 64
+/*
+ * Sentinel value - indicates a device that does not exist
+ */
+#define BCH_SB_MEMBER_INVALID 255
+
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index c8c266cb..215eed4c 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index d86d5dae..77597fd7 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -524,7 +524,7 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
/* XXX this is wrong, we need a 96 or 128 bit integer type */
- c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
+ c->sb.time_base_lo = div64_u64(le64_to_cpu(src->time_base_lo),
c->sb.nsec_per_time_unit);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index d8adf465..873e4be7 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
@@ -1592,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
-static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(c, ret, "removing dev alloc info");
- return ret;
-}
-
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@@ -1730,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members_v2 *mi;
- struct bch_member dev_mi;
- unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@@ -1742,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+ struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@@ -1780,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
if (dynamic_fault("bcachefs:add:no_slot"))
- goto no_slot;
-
- if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
- dev_idx = c->sb.nr_devices;
- goto have_slot;
- }
-
- int best = -1;
- u64 best_last_mount = 0;
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-no_slot:
- ret = -BCH_ERR_ENOSPC_sb_members;
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+ goto err_unlock;
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi) {
- ret = -BCH_ERR_ENOSPC_sb_members;
+ ret = bch2_sb_member_alloc(c);
+ if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ unsigned dev_idx = ret;
/* success: */
- *m = dev_mi;
- m->last_mount = cpu_to_le64(ktime_get_real_seconds());
- c->disk_sb.sb->nr_devices = nr_devices;
+ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 4a373581..03e59f86 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
-
- mutex_unlock(&c->btree_cache.lock);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
@@ -287,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_tab_rjust(out);
prt_human_readable_u64(out, nr_extents
- ? div_u64(sectors_uncompressed << 9, nr_extents)
+ ? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
@@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {
@@ -456,7 +461,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
if (attr == &sysfs_trigger_gc)
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 2acdfa78..42f565c7 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;
while (p--) {
- if (*res > div_u64(U64_MAX, n))
+ if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
parse_or_ret(cp, parse_unit_suffix(cp, &b));
- if (v > div_u64(U64_MAX, b))
+ if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;
- if (f_n > div_u64(U64_MAX, b))
+ if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;
- f_n = div_u64(f_n * b, f_d);
+ f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[])
s = strim(d);
- while ((p = strsep(&s, ","))) {
+ while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
if (flag < 0) {
@@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
@@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
- prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
diff --git a/libbcachefs/xattr_format.h b/libbcachefs/xattr_format.h
index e9f81053..c7916011 100644
--- a/libbcachefs/xattr_format.h
+++ b/libbcachefs/xattr_format.h
@@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
- __u8 x_name[];
+ __u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */