Diffstat (limited to 'fs/bcachefs/bcachefs.h')
-rw-r--r-- | fs/bcachefs/bcachefs.h | 235
1 file changed, 167 insertions, 68 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6beff8810c09..907d1b605cf4 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_H
 #define _BCACHEFS_H
 
@@ -183,6 +184,7 @@
 #include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/math64.h>
 #include <linux/mutex.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
@@ -201,7 +203,7 @@
 
 #include <linux/dynamic_fault.h>
 
-#define bch2_fs_init_fault(name)				\
+#define bch2_fs_init_fault(name)					\
 	dynamic_fault("bcachefs:bch_fs_init:" name)
 #define bch2_meta_read_fault(name)					\
 	dynamic_fault("bcachefs:meta:read:" name)
@@ -220,18 +222,22 @@
 	printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn(c, fmt, ...) \
 	printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
 	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)					\
 do {									\
-	if ((c)->opts.verbose_recovery)					\
+	if ((c)->opts.verbose)						\
 		bch_info(c, fmt, ##__VA_ARGS__);			\
 } while (0)
 
 #define pr_verbose_init(opts, fmt, ...)					\
 do {									\
-	if (opt_get(opts, verbose_init))				\
+	if (opt_get(opts, verbose))					\
 		pr_info(fmt, ##__VA_ARGS__);				\
 } while (0)
 
@@ -252,6 +258,8 @@ do { \
 	BCH_DEBUG_PARAM(expensive_debug_checks,				\
 		"Enables various runtime debugging checks that "	\
 		"significantly affect performance")			\
+	BCH_DEBUG_PARAM(debug_check_iterators,				\
+		"Enables extra verification for btree iterators")	\
 	BCH_DEBUG_PARAM(debug_check_bkeys,				\
 		"Run bkey_debugcheck (primarily checking GC/allocation "\
 		"information) when iterating over keys")		\
@@ -259,6 +267,25 @@
 		"Reread btree nodes at various points to verify the "	\
 		"mergesort in the read path against modifications "	\
 		"done in memory")					\
+	BCH_DEBUG_PARAM(journal_seq_verify,				\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")		\
+	BCH_DEBUG_PARAM(inject_invalid_keys,				\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")		\
+	BCH_DEBUG_PARAM(test_alloc_startup,				\
+		"Force allocator startup to use the slowpath where it"	\
+		"can't find enough free buckets without invalidating"	\
+		"cached data")						\
+	BCH_DEBUG_PARAM(force_reconstruct_read,				\
+		"Force reads to use the reconstruct path, when reading"	\
+		"from erasure coded extents")				\
+	BCH_DEBUG_PARAM(test_restart_gc,				\
+		"Test restarting mark and sweep gc when bucket gens change")\
+	BCH_DEBUG_PARAM(test_reconstruct_alloc,				\
+		"Test reconstructing the alloc btree")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -270,10 +297,11 @@ do { \
 
 #define BCH_TIME_STATS()			\
 	x(btree_node_mem_alloc)			\
+	x(btree_node_split)			\
+	x(btree_node_sort)			\
+	x(btree_node_read)			\
 	x(btree_gc)				\
-	x(btree_split)				\
-	x(btree_sort)				\
-	x(btree_read)				\
+	x(btree_update)				\
 	x(btree_lock_contended_read)		\
 	x(btree_lock_contended_intent)		\
 	x(btree_lock_contended_write)		\
@@ -282,8 +310,10 @@ do { \
 	x(data_read)				\
 	x(data_write)				\
 	x(data_promote)				\
 	x(journal_write)			\
 	x(journal_delay)			\
-	x(journal_blocked)			\
-	x(journal_flush_seq)
+	x(journal_flush_seq)			\
+	x(blocked_journal)			\
+	x(blocked_allocate)			\
+	x(blocked_allocate_open_bucket)
 
 enum bch_time_stats {
 #define x(name) BCH_TIME_##name,
@@ -296,35 +326,42 @@ enum bch_time_stats {
 #include "btree_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
+#include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
 #include "quota_types.h"
 #include "rebalance_types.h"
+#include "replicas_types.h"
 #include "super_types.h"
 
-/*
- * Number of nodes we might have to allocate in a worst case btree split
- * operation - we split all the way up to the root, then allocate a new root.
- */
-#define btree_reserve_required_nodes(depth)	(((depth) + 1) * 2 + 1)
-
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES		4U
 
 /* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX					\
-	(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
 
 /* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)
+#define BTREE_NODE_RESERVE	BTREE_RESERVE_MAX
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
 
 struct btree;
 
 enum gc_phase {
-	GC_PHASE_SB		= BTREE_ID_NR + 1,
+	GC_PHASE_NOT_RUNNING,
+	GC_PHASE_START,
+	GC_PHASE_SB,
+
+	GC_PHASE_BTREE_EC,
+	GC_PHASE_BTREE_EXTENTS,
+	GC_PHASE_BTREE_INODES,
+	GC_PHASE_BTREE_DIRENTS,
+	GC_PHASE_BTREE_XATTRS,
+	GC_PHASE_BTREE_ALLOC,
+	GC_PHASE_BTREE_QUOTAS,
+	GC_PHASE_PENDING_DELETE,
 	GC_PHASE_ALLOC,
-	GC_PHASE_DONE
 };
 
 struct gc_pos {
@@ -356,6 +393,7 @@ struct bch_dev {
 	char			name[BDEVNAME_SIZE];
 
 	struct bch_sb_handle	disk_sb;
+	struct bch_sb		*sb_read_scratch;
 	int			sb_write_error;
 
 	struct bch_devs_mask	self;
@@ -365,18 +403,16 @@ struct bch_dev {
 
 	/*
 	 * Buckets:
-	 * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
+	 * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
 	 * gc_lock, for device resize - holding any is sufficient for access:
 	 * Or rcu_read_lock(), but only for ptr_stale():
 	 */
-	struct bucket_array __rcu *buckets;
-	unsigned long		*buckets_dirty;
-	/* most out of date gen in the btree */
-	u8			*oldest_gens;
+	struct bucket_array __rcu *buckets[2];
+	unsigned long		*buckets_nouse;
+	unsigned long		*buckets_written;
 	struct rw_semaphore	bucket_lock;
 
-	struct bch_dev_usage __percpu *usage_percpu;
-	struct bch_dev_usage	usage_cached;
+	struct bch_dev_usage __percpu *usage[2];
 
 	/* Allocator: */
 	struct task_struct __rcu *alloc_thread;
@@ -393,7 +429,6 @@ struct bch_dev {
 	alloc_fifo		free[RESERVE_NR];
 	alloc_fifo		free_inc;
 	spinlock_t		freelist_lock;
-	size_t			nr_invalidated;
 
 	u8			open_buckets_partial[OPEN_BUCKETS_COUNT];
 	unsigned		open_buckets_partial_nr;
@@ -403,12 +438,19 @@ struct bch_dev {
 	/* last calculated minimum prio */
 	u16			max_last_bucket_io[2];
 
-	atomic_long_t		saturated_count;
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
-	u64			allocator_journal_seq_flush;
-	bool			allocator_invalidating_data;
-	bool			allocator_blocked;
+
+	/*
+	 * XXX: this should be an enum for allocator state, so as to include
+	 * error state
+	 */
+	enum {
+		ALLOCATOR_STOPPED,
+		ALLOCATOR_RUNNING,
+		ALLOCATOR_BLOCKED,
+		ALLOCATOR_BLOCKED_FULL,
+	} allocator_state;
 
 	alloc_heap		alloc_heap;
 
@@ -417,6 +459,7 @@ struct bch_dev {
 	copygc_heap		copygc_heap;
 	struct bch_pd_controller copygc_pd;
 	struct write_point	copygc_write_point;
+	u64			copygc_threshold;
 
 	atomic64_t		rebalance_work;
 
@@ -435,33 +478,27 @@ struct bch_dev {
 	struct io_count __percpu *io_done;
 };
 
-/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
- */
 enum {
 	/* startup: */
 	BCH_FS_ALLOC_READ_DONE,
 	BCH_FS_ALLOCATOR_STARTED,
+	BCH_FS_ALLOCATOR_RUNNING,
 	BCH_FS_INITIAL_GC_DONE,
 	BCH_FS_FSCK_DONE,
 	BCH_FS_STARTED,
+	BCH_FS_RW,
 
 	/* shutdown: */
+	BCH_FS_STOPPING,
 	BCH_FS_EMERGENCY_RO,
 	BCH_FS_WRITE_DISABLE_COMPLETE,
 
 	/* errors: */
 	BCH_FS_ERROR,
-	BCH_FS_GC_FAILURE,
+	BCH_FS_ERRORS_FIXED,
 
 	/* misc: */
 	BCH_FS_BDEV_MOUNTED,
-	BCH_FS_FSCK_FIXED_ERRORS,
 	BCH_FS_FIXED_GENS,
 	BCH_FS_REBUILD_REPLICAS,
 	BCH_FS_HOLD_BTREE_WRITES,
@@ -474,11 +511,17 @@ struct btree_debug {
 	struct dentry		*failed;
 };
 
-enum bch_fs_state {
-	BCH_FS_STARTING		= 0,
-	BCH_FS_STOPPING,
-	BCH_FS_RO,
-	BCH_FS_RW,
+struct bch_fs_pcpu {
+	u64			sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+	size_t			nr;
+	struct journal_seq_blacklist_table_entry {
+		u64		start;
+		u64		end;
+		bool		dirty;
+	}			entries[0];
 };
 
 struct bch_fs {
@@ -498,7 +541,6 @@ struct bch_fs {
 
 	/* ro/rw, add/remove devices: */
 	struct mutex		state_lock;
-	enum bch_fs_state	state;
 
 	/* Counts outstanding writes, for clean transition to read-only */
 	struct percpu_ref	writes;
@@ -506,10 +548,12 @@ struct bch_fs {
 
 	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];
 
-	struct bch_replicas_cpu __rcu *replicas;
-	struct bch_replicas_cpu __rcu *replicas_gc;
+	struct bch_replicas_cpu replicas;
+	struct bch_replicas_cpu replicas_gc;
 	struct mutex		replicas_gc_lock;
 
+	struct journal_entry_res replicas_journal_res;
+
 	struct bch_disk_groups_cpu __rcu *disk_groups;
 
 	struct bch_opts		opts;
@@ -519,6 +563,7 @@ struct bch_fs {
 		uuid_le		uuid;
 		uuid_le		user_uuid;
 
+		u16		version;
 		u16		encoded_extent_max;
 
 		u8		nr_devices;
@@ -530,6 +575,7 @@ struct bch_fs {
 		u32		time_base_hi;
 		u32		time_precision;
 		u64		features;
+		u64		compat;
 	} sb;
 
 	struct bch_sb_handle	disk_sb;
@@ -568,9 +614,12 @@ struct bch_fs {
 	struct mutex		btree_interior_update_lock;
 	struct closure_waitlist	btree_interior_update_wait;
 
+	mempool_t		btree_iters_pool;
+
 	struct workqueue_struct	*wq;
 	/* copygc needs its own workqueue for index updates.. */
 	struct workqueue_struct	*copygc_wq;
+	struct workqueue_struct	*journal_reclaim_wq;
 
 	/* ALLOCATION */
 	struct delayed_work	pd_controllers_update;
@@ -586,14 +635,22 @@
 	 * and forces them to be revalidated
 	 */
 	u32			capacity_gen;
+	unsigned		bucket_size_max;
 
 	atomic64_t		sectors_available;
 
-	struct bch_fs_usage __percpu *usage_percpu;
-	struct bch_fs_usage	usage_cached;
-	struct percpu_rw_semaphore usage_lock;
+	struct bch_fs_pcpu __percpu	*pcpu;
 
-	struct closure_waitlist	freelist_wait;
+	struct percpu_rw_semaphore	mark_lock;
+
+	seqcount_t			usage_lock;
+	struct bch_fs_usage		*usage_base;
+	struct bch_fs_usage __percpu	*usage[2];
+	struct bch_fs_usage __percpu	*usage_gc;
+
+	/* single element mempool: */
+	struct mutex		usage_scratch_lock;
+	struct bch_fs_usage	*usage_scratch;
 
 	/*
 	 * When we invalidate buckets, we use both the priority and the amount
@@ -605,8 +662,16 @@ struct bch_fs {
 
 	struct io_clock		io_clock[2];
 
+	/* JOURNAL SEQ BLACKLIST */
+	struct journal_seq_blacklist_table *
+				journal_seq_blacklist_table;
+	struct work_struct	journal_seq_blacklist_gc_work;
+
 	/* ALLOCATOR */
 	spinlock_t		freelist_lock;
+	struct closure_waitlist	freelist_wait;
+	u64			blocked_allocate;
+	u64			blocked_allocate_open_bucket;
 	u8			open_buckets_freelist;
 	u8			open_buckets_nr_free;
 	struct closure_waitlist	open_buckets_wait;
@@ -615,9 +680,10 @@ struct bch_fs {
 	struct write_point	btree_write_point;
 	struct write_point	rebalance_write_point;
 
-	struct write_point	write_points[WRITE_POINT_COUNT];
-	struct hlist_head	write_points_hash[WRITE_POINT_COUNT];
+	struct write_point	write_points[WRITE_POINT_MAX];
+	struct hlist_head	write_points_hash[WRITE_POINT_HASH_NR];
 	struct mutex		write_points_hash_lock;
+	unsigned		write_points_nr;
 
 	/* GARBAGE COLLECTION */
 	struct task_struct	*gc_thread;
@@ -630,9 +696,6 @@ struct bch_fs {
 	 *
 	 * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
 	 *
-	 * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
-	 * currently running, and gc marks are currently valid
-	 *
 	 * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
 	 * can read without a lock.
 	 */
@@ -659,7 +722,7 @@ struct bch_fs {
 	ZSTD_parameters		zstd_params;
 
 	struct crypto_shash	*sha256;
-	struct crypto_skcipher	*chacha20;
+	struct crypto_sync_skcipher *chacha20;
 	struct crypto_shash	*poly1305;
 
 	atomic64_t		key_version;
@@ -667,6 +730,22 @@ struct bch_fs {
 	/* REBALANCE */
 	struct bch_fs_rebalance	rebalance;
 
+	/* STRIPES: */
+	GENRADIX(struct stripe) stripes[2];
+	struct mutex		ec_stripe_create_lock;
+
+	ec_stripes_heap		ec_stripes_heap;
+	spinlock_t		ec_stripes_heap_lock;
+
+	/* ERASURE CODING */
+	struct list_head	ec_new_stripe_list;
+	struct mutex		ec_new_stripe_lock;
+
+	struct bio_set		ec_bioset;
+
+	struct work_struct	ec_stripe_delete_work;
+	struct llist_head	ec_stripe_delete_list;
+
 	/* VFS IO PATH - fs-io.c */
 	struct bio_set		writepage_bioset;
 	struct bio_set		dio_write_bioset;
@@ -681,9 +760,6 @@ struct bch_fs {
 	struct mutex		fsck_error_lock;
 	bool			fsck_alloc_err;
 
-	/* FILESYSTEM */
-	atomic_long_t		nr_inodes;
-
 	/* QUOTAS */
 	struct bch_memquota_type quotas[QTYP_NR];
 
@@ -708,7 +784,7 @@ struct bch_fs {
 
 	struct journal		journal;
 
-	unsigned		bucket_journal_seq;
+	u64			last_bucket_seq_cleanup;
 
 	/* The rest of this all shows up in sysfs */
 	atomic_long_t		read_realloc_races;
@@ -734,11 +810,6 @@ static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 #endif
 }
 
-static inline bool bch2_fs_running(struct bch_fs *c)
-{
-	return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
 static inline unsigned bucket_bytes(const struct bch_dev *ca)
 {
 	return ca->mi.bucket_size << 9;
@@ -749,4 +820,32 @@ static inline unsigned block_bytes(const struct bch_fs *c)
 	return c->opts.block_size << 9;
 }
 
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+	return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+	s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+	if (c->sb.time_precision == 1)
+		return ns;
+
+	return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+	struct timespec64 now;
+
+	ktime_get_coarse_real_ts64(&now);
+	return timespec_to_bch2_time(c, now);
+}
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+	return dev < c->sb.nr_devices && c->devs[dev];
+}
+
 #endif /* _BCACHEFS_H */
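
Note on the BCH_TIME_STATS() change above: the list is an x-macro, so adding an entry such as x(blocked_allocate) gives it both an enum constant (via "#define x(name) BCH_TIME_##name,") and, elsewhere in the tree, a matching name string. A minimal standalone sketch of that pattern follows; the DEMO_* identifiers are illustrative only and are not part of the bcachefs sources.

/* Standalone sketch of the x-macro pattern used by BCH_TIME_STATS(). */
#include <stdio.h>

#define DEMO_TIME_STATS()		\
	x(btree_node_split)		\
	x(btree_node_sort)		\
	x(blocked_journal)

/* One list expands into the enum... */
enum demo_time_stats {
#define x(name)	DEMO_TIME_##name,
	DEMO_TIME_STATS()
#undef x
	DEMO_TIME_NR
};

/* ...and into a parallel table of names, kept in sync automatically. */
static const char * const demo_time_stat_names[] = {
#define x(name)	#name,
	DEMO_TIME_STATS()
#undef x
};

int main(void)
{
	for (int i = 0; i < DEMO_TIME_NR; i++)
		printf("%d: %s\n", i, demo_time_stat_names[i]);
	return 0;
}

Because both expansions come from the same list, renaming or reordering a stat (as this commit does) cannot leave the enum and the name table out of step.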