author    Kent Overstreet <kent.overstreet@gmail.com>  2021-03-28 17:38:28 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2021-03-29 00:22:38 -0400
commit    a2094890a90a2f865e49f94e8448deca7e5852ef
tree      11bf5f426509e288b2b3482492c805a26bb1885a
parent    bb6eccc2ecd4728871bfc70462d3a4a20daa9d68

Update bcachefs sources to 18686af684 bcachefs: Inode backpointers (v0.13)
-rw-r--r--  .bcachefs_revision                    |    2
-rw-r--r--  Makefile                              |    4
-rw-r--r--  cmd_debug.c                           |    4
-rw-r--r--  include/linux/list_nulls.h            |  145
-rw-r--r--  include/linux/overflow.h              |  346
-rw-r--r--  include/linux/poison.h                |   85
-rw-r--r--  include/linux/random.h                |    1
-rw-r--r--  include/linux/rcupdate.h              |   28
-rw-r--r--  include/linux/rhashtable-types.h      |  135
-rw-r--r--  include/linux/rhashtable.h            | 1205
-rw-r--r--  include/linux/six.h                   |    1
-rw-r--r--  include/linux/slab.h                  |    1
-rw-r--r--  include/linux/types.h                 |    2
-rw-r--r--  libbcachefs/bcachefs_format.h         |   31
-rw-r--r--  libbcachefs/bkey.c                    |   25
-rw-r--r--  libbcachefs/bkey.h                    |   78
-rw-r--r--  libbcachefs/bkey_methods.c            |   46
-rw-r--r--  libbcachefs/bkey_sort.c               |    2
-rw-r--r--  libbcachefs/bset.c                    |   84
-rw-r--r--  libbcachefs/bset.h                    |   22
-rw-r--r--  libbcachefs/btree_cache.c             |   12
-rw-r--r--  libbcachefs/btree_gc.c                |   28
-rw-r--r--  libbcachefs/btree_gc.h                |   10
-rw-r--r--  libbcachefs/btree_io.c                |   32
-rw-r--r--  libbcachefs/btree_io.h                |   30
-rw-r--r--  libbcachefs/btree_iter.c              |  103
-rw-r--r--  libbcachefs/btree_iter.h              |    4
-rw-r--r--  libbcachefs/btree_key_cache.c         |  185
-rw-r--r--  libbcachefs/btree_key_cache.h         |    8
-rw-r--r--  libbcachefs/btree_types.h             |   32
-rw-r--r--  libbcachefs/btree_update.h            |    2
-rw-r--r--  libbcachefs/btree_update_interior.c   |   79
-rw-r--r--  libbcachefs/btree_update_leaf.c       |   45
-rw-r--r--  libbcachefs/debug.c                   |   10
-rw-r--r--  libbcachefs/dirent.c                  |   18
-rw-r--r--  libbcachefs/dirent.h                  |    6
-rw-r--r--  libbcachefs/ec.c                      |    1
-rw-r--r--  libbcachefs/extents.c                 |    7
-rw-r--r--  libbcachefs/extents.h                 |   18
-rw-r--r--  libbcachefs/fs-common.c               |   68
-rw-r--r--  libbcachefs/fsck.c                    |   43
-rw-r--r--  libbcachefs/inode.c                   |   19
-rw-r--r--  libbcachefs/inode.h                   |    3
-rw-r--r--  libbcachefs/io.c                      |    5
-rw-r--r--  libbcachefs/journal.c                 |   11
-rw-r--r--  libbcachefs/journal_io.c              |    2
-rw-r--r--  libbcachefs/journal_reclaim.c         |    4
-rw-r--r--  libbcachefs/recovery.c                |   24
-rw-r--r--  libbcachefs/tests.c                   |   29
-rw-r--r--  linux/rhashtable.c                    | 1081
-rw-r--r--  linux/six.c                           |   35
51 files changed, 3435 insertions(+), 766 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 976139a3..385c19f6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ad68801b939cdda0530f54cd07b3212e98fe1d75
+18686af68412ebfad9c2adc6ee976ffdb9e1b886
diff --git a/Makefile b/Makefile
index 6999b93a..3fe96048 100644
--- a/Makefile
+++ b/Makefile
@@ -156,6 +156,10 @@ update-bcachefs-sources:
git add linux/six.c
cp $(LINUX_DIR)/include/linux/six.h include/linux/
git add include/linux/six.h
+ cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+ git add include/linux/list_nulls.h
+ cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+ git add include/linux/poison.h
$(RM) libbcachefs/*.mod.c
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
git add .bcachefs_revision
diff --git a/cmd_debug.c b/cmd_debug.c
index 3baa6978..4938ec07 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -323,9 +323,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
le64_to_cpu(i->journal_seq));
offset += sectors;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i))) {
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
struct bkey u;
char buf[4096];
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644
index 00000000..fa6e8471
--- /dev/null
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+ struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+ struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+ })
+/**
+ * is_a_nulls - Test if a ptr is a 'nulls' end-of-list marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr) >> 1;
+}
+
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+ return !h->pprev;
+}
+
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+ return !READ_ONCE(h->pprev);
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+ return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+ struct hlist_nulls_head *h)
+{
+ struct hlist_nulls_node *first = h->first;
+
+ n->next = first;
+ WRITE_ONCE(n->pprev, &h->first);
+ h->first = n;
+ if (!is_a_nulls(first))
+ WRITE_ONCE(first->pprev, &n->next);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ struct hlist_nulls_node *next = n->next;
+ struct hlist_nulls_node **pprev = n->pprev;
+
+ WRITE_ONCE(*pprev, next);
+ if (!is_a_nulls(next))
+ WRITE_ONCE(next->pprev, pprev);
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ __hlist_nulls_del(n);
+ WRITE_ONCE(n->pprev, LIST_POISON2);
+}
+
+/**
+ * hlist_nulls_for_each_entry - iterate over list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member) \
+ for (pos = (head)->first; \
+ (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+ * @member: the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member) \
+ for (; (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+#endif
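
As a rough illustration of the 'nulls' encoding described at the top of this header, here is a hypothetical user-space sketch; the marker value 7 and all variable names are chosen only for the example:

#include <linux/list_nulls.h>

static struct hlist_nulls_head demo_bucket;
static struct hlist_nulls_node demo_node;

static void nulls_demo(void)
{
	struct hlist_nulls_node *end;

	INIT_HLIST_NULLS_HEAD(&demo_bucket, 7);	/* first = (7 << 1) | 1 */
	hlist_nulls_add_head(&demo_node, &demo_bucket);

	/* Walking off the end lands on the marker, not on NULL: */
	end = demo_node.next;
	if (is_a_nulls(end))
		(void) get_nulls_value(end);	/* == 7: identifies the chain */
}
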
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644
index 00000000..ef74051d
--- /dev/null
+++ b/include/linux/overflow.h
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type) (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
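
For concreteness, a few values these macros evaluate to, assuming the usual u8/s8/u32 typedefs (an illustrative sketch, not part of the header):

_Static_assert(type_max(u8)  == 255,  "u8 max");
_Static_assert(type_min(u8)  == 0,    "u8 min");
_Static_assert(type_max(s8)  == 127,  "s8 max");
_Static_assert(type_min(s8)  == -128, "s8 min");
_Static_assert(type_max(u32) == 4294967295u, "u32 max");
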
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+ return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_add_overflow(__a, __b, __d); \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_sub_overflow(__a, __b, __d); \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_mul_overflow(__a, __b, __d); \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a + __b; \
+ *__d < __a; \
+})
+#define __unsigned_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a - __b; \
+ __a < __b; \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a * __b; \
+ __builtin_constant_p(__b) ? \
+ __b > 0 && __a > type_max(typeof(__a)) / __b : \
+ __a > 0 && __b > type_max(typeof(__b)) / __a; \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a + (u64)__b; \
+ (((~(__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
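
A worked instance of the sign rule, assuming an s8 destination (hypothetical, for illustration only):

static inline int signed_add_demo(void)
{
	s8 a = 100, b = 100, res;

	/* 100 + 100 wraps to -56: the operands share a sign, the result's
	 * sign differs, so (~(a ^ b)) & (res ^ a) has the sign bit set and
	 * the macro reports overflow (returns nonzero). */
	return __signed_add_overflow(a, b, &res);
}
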
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a - (u64)__b; \
+ ((((__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ typeof(a) __tmax = type_max(typeof(a)); \
+ typeof(a) __tmin = type_min(typeof(a)); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a * (u64)__b; \
+ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
+ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
+ (__b == (typeof(__b))-1 && __a == __tmin); \
+})
+
+
+#define check_add_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_add_overflow(a, b, d), \
+ __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_sub_overflow(a, b, d), \
+ __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_mul_overflow(a, b, d), \
+ __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
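
A hedged usage sketch of the generic checkers; the helper name and the -EOVERFLOW convention are assumptions for the example, not something this header mandates:

#include <linux/errno.h>
#include <linux/overflow.h>

/* Store nr * elem_size in *bytes, refusing silently wrapped sizes. */
static inline int checked_array_bytes(size_t nr, size_t elem_size, size_t *bytes)
{
	if (check_mul_overflow(nr, elem_size, bytes))
		return -EOVERFLOW;
	return 0;
}
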
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ * 'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
+ typeof(a) _a = a; \
+ typeof(s) _s = s; \
+ typeof(d) _d = d; \
+ u64 _a_full = _a; \
+ unsigned int _to_shift = \
+ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
+ *_d = (_a_full << _to_shift); \
+ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
+ (*_d >> _to_shift) != _a); \
+}))
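
For example, converting 512-byte sectors to bytes without losing high bits (a sketch; the helper itself is hypothetical):

static inline int sectors_to_bytes(u64 sectors, u64 *bytes)
{
	/* Fails if sectors << 9 would drop bits or set the sign bit. */
	if (check_shl_overflow(sectors, 9, bytes))
		return -EOVERFLOW;
	return 0;
}
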
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_mul_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_add_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count) \
+ __ab_c_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member),\
+ sizeof(*(p)))
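
A hedged allocation sketch using struct_size(); the structure and the plain malloc() call are hypothetical stand-ins:

#include <stdlib.h>

struct sector_list {
	unsigned int	nr;
	u64		sectors[];	/* flexible array member */
};

static inline struct sector_list *sector_list_alloc(unsigned int nr)
{
	/* On overflow struct_size() returns SIZE_MAX, so the allocation
	 * fails cleanly instead of being silently undersized. */
	struct sector_list *l = malloc(struct_size(l, sectors, nr));

	if (l)
		l->nr = nr;
	return l;
}
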
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ * within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count) \
+ array_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 00000000..dc8ae5d8
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/page_poison.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
+#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
+
+#define SLUB_RED_INACTIVE 0xbb
+#define SLUB_RED_ACTIVE 0xcc
+
+/* ...and for poisoning */
+#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
+#define POISON_FREE 0x6b /* for use-after-free poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM 0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE 0x5b
+#define JBD2_POISON_FREE 0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define POOL_POISON_FREED 0xa7 /* !inuse */
+#define POOL_POISON_ALLOCATED 0xa9 /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE 0x12
+#define ATM_POISON 0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT 0x11
+#define MUTEX_DEBUG_FREE 0x22
+#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
+
+/********** security/ **********/
+#define KEY_DESTROY 0xbd
+
+#endif
diff --git a/include/linux/random.h b/include/linux/random.h
index c38ae46d..28c595a0 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -45,6 +45,7 @@ static inline type get_random_##type(void) \
get_random_type(int);
get_random_type(long);
+get_random_type(u32);
get_random_type(u64);
#endif /* _LINUX_RANDOM_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c99d78a8..ae292241 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -13,4 +13,32 @@
#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+ rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+ void (*f)(struct rcu_head *head))
+{
+ void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+ if (func == f)
+ return true;
+ return false;
+}
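
A hedged sketch of the intended pairing; the object type and free callback below are hypothetical:

#include <linux/slab.h>

struct cached_obj {
	struct rcu_head	rcu;
	/* ... payload ... */
};

static void cached_obj_free(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct cached_obj, rcu));
}

/*
 * Call rcu_head_init(&obj->rcu) right after allocating the object and
 * call_rcu(&obj->rcu, cached_obj_free) when retiring it; this test then
 * tells you whether the object has already been queued for freeing.
 */
static bool cached_obj_being_freed(struct cached_obj *obj)
{
	return rcu_head_after_call_rcu(&obj->rcu, cached_obj_free);
}
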
+
#endif /* __TOOLS_LINUX_RCUPDATE_H */
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644
index 00000000..57467cbf
--- /dev/null
+++ b/include/linux/rhashtable-types.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+ struct rhash_head __rcu *next;
+};
+
+struct rhlist_head {
+ struct rhash_head rhead;
+ struct rhlist_head __rcu *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+ struct rhashtable *ht;
+ const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+ const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+ u16 nelem_hint;
+ u16 key_len;
+ u16 key_offset;
+ u16 head_offset;
+ unsigned int max_size;
+ u16 min_size;
+ bool automatic_shrinking;
+ rht_hashfn_t hashfn;
+ rht_obj_hashfn_t obj_hashfn;
+ rht_obj_cmpfn_t obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+ struct bucket_table __rcu *tbl;
+ unsigned int key_len;
+ unsigned int max_elems;
+ struct rhashtable_params p;
+ bool rhlist;
+ struct work_struct run_work;
+ struct mutex mutex;
+ spinlock_t lock;
+ atomic_t nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+ struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+ struct list_head list;
+ struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+ struct rhashtable *ht;
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ struct rhashtable_walker walker;
+ unsigned int slot;
+ unsigned int skip;
+ bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
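
A hedged construction sketch against these types; the object layout, the table, and every name here are hypothetical:

#include <linux/rhashtable.h>

struct demo_obj {
	u64			key;
	struct rhash_head	hash;
};

static const struct rhashtable_params demo_params = {
	.key_len	= sizeof(u64),
	.key_offset	= offsetof(struct demo_obj, key),
	.head_offset	= offsetof(struct demo_obj, hash),
	.automatic_shrinking = true,
};

static struct rhashtable demo_table;

static int demo_table_init(void)
{
	return rhashtable_init(&demo_table, &demo_params);
}
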
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8dbe1533..6cf8c257 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resizable, Scalable, Concurrent Hash Table
*
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
* Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
@@ -17,92 +18,93 @@
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/list_nulls.h>
#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
-#define RHT_BASE_BITS 4
-#define RHT_HASH_BITS 27
-#define RHT_BASE_SHIFT RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
+#define BIT(nr) (1UL << (nr))
-struct rhash_head {
- struct rhash_head __rcu *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain of the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket. This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain. To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket. This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful. This ensures it isn't
+ * used by mistake without first clearing the lock bit.
+ */
+struct rhash_lock_head {};
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
struct bucket_table {
unsigned int size;
- unsigned int rehash;
+ unsigned int nest;
u32 hash_rnd;
- unsigned int locks_mask;
- spinlock_t *locks;
struct list_head walkers;
struct rcu_head rcu;
struct bucket_table __rcu *future_tbl;
- struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
-};
-
-struct rhashtable_compare_arg {
- struct rhashtable *ht;
- const void *key;
+ struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
- const void *obj);
-
-struct rhashtable_params {
- size_t nelem_hint;
- size_t key_len;
- size_t key_offset;
- size_t head_offset;
- unsigned int insecure_max_entries;
- unsigned int max_size;
- unsigned int min_size;
- u32 nulls_base;
- bool insecure_elasticity;
- bool automatic_shrinking;
- size_t locks_mul;
- rht_hashfn_t hashfn;
- rht_obj_hashfn_t obj_hashfn;
- rht_obj_cmpfn_t obj_cmpfn;
-};
-
-struct rhashtable {
- struct bucket_table __rcu *tbl;
- atomic_t nelems;
- unsigned int key_len;
- unsigned int elasticity;
- struct rhashtable_params p;
- struct work_struct run_work;
- struct mutex mutex;
- spinlock_t lock;
-};
-
-struct rhashtable_walker {
- struct list_head list;
- struct bucket_table *tbl;
-};
-
-#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
- return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
- ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL
+ */
+#define RHT_NULLS_MARKER(ptr) \
+ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr) \
+ ((ptr) = NULL)
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
@@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht,
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
- return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+ return hash & (tbl->size - 1);
}
-static inline unsigned int rht_key_hashfn(
- struct rhashtable *ht, const struct bucket_table *tbl,
- const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+ const void *key, const struct rhashtable_params params,
+ unsigned int hash_rnd)
{
unsigned int hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
- hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+ hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
else if (params.key_len) {
unsigned int key_len = params.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else if (key_len & (sizeof(u32) - 1))
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
else
- hash = jhash2(key, key_len / sizeof(u32),
- tbl->hash_rnd);
+ hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
} else {
unsigned int key_len = ht->p.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
}
+ return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const void *key, const struct rhashtable_params params)
+{
+ unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
return rht_bucket_index(tbl, hash);
}
@@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn(
rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
tbl->size > ht->p.min_size;
}
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
- return ht->p.insecure_max_entries &&
- atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+ return atomic_read(&ht->nelems) >= ht->max_elems;
}
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
- unsigned int hash)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
- return &tbl->locks[hash & tbl->locks_mask];
+ return 1;
}
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
- const void *,
- struct rhash_head *,
- struct bucket_table *);
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+ u32 hash)
+{
+ return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj);
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_walk_enter(struct rhashtable *ht,
+ struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
-#define rht_dereference(p, ht) rcu_dereference(p)
-#define rht_dereference_rcu(p, ht) rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+ (void)rhashtable_walk_start_check(iter);
+}
+
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+ rcu_dereference(p)
#define rht_entry(tpos, pos, member) \
({ tpos = container_of(pos, typeof(*tpos), member); 1; })
-#define rht_for_each_continue(pos, head, tbl, hash) \
- for (pos = rht_dereference_bucket(head, tbl, hash); \
- !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+ struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers. The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer. In that case
+ * we cannot get a lock. For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there. In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bucket,
+ unsigned int subclass)
+{
+ bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+ return (struct rhash_head *)
+ ((unsigned long)p & ~BIT(0) ?:
+ (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ * rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ * rht_ptr() dereferences in a context where the bucket is locked.
+ * rht_ptr_exclusive() dereferences in a context where exclusive
+ * access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+ struct rhash_lock_head __rcu *const *bkt,
+ struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)obj);
+ preempt_enable();
+ __release(bitlock);
+}
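
A condensed, hypothetical sketch of the locking pattern described above, inserting at the head of a bucket; it deliberately omits the error handling, nelems accounting, and resize checks that the real insert path performs:

static inline void demo_bucket_push(struct rhashtable *ht,
				    struct bucket_table *tbl,
				    struct rhash_head *obj,
				    unsigned int hash)
{
	struct rhash_lock_head __rcu **bkt = rht_bucket_insert(ht, tbl, hash);

	rht_lock(tbl, bkt);				/* sets BIT(0) */
	RCU_INIT_POINTER(obj->next, rht_ptr(bkt, tbl, hash));
	rht_assign_unlock(tbl, bkt, obj);		/* publish head + unlock */
}
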
+
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+ for (pos = head; \
+ !rht_is_a_nulls(pos); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
#define rht_for_each(pos, tbl, hash) \
- rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \
+ for (pos = head; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
-#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_from(tpos, pos, \
+ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @next: the &struct rhash_head to use as next in loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
+ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = next, \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_rcu(pos, tbl, hash) \
- rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash) \
+ for (({barrier(); }), \
+ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
- tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_rcu_from(tpos, pos, \
+ rht_ptr_rcu(rht_bucket(tbl, hash)), \
+ tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos: the &struct rhlist_head to use as a loop cursor.
+ * @list: the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list) \
+ for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhlist_head to use as a loop cursor.
+ * @list: the head of the list
+ * @member: name of the &struct rhlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member) \
+ for (pos = list; pos && rht_entry(tpos, pos, member); \
+ pos = rcu_dereference_raw(pos->next))
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
const void *obj)
@@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
@@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast(
.ht = ht,
.key = key,
};
- const struct bucket_table *tbl;
+ struct rhash_lock_head __rcu *const *bkt;
+ struct bucket_table *tbl;
struct rhash_head *he;
unsigned int hash;
- rcu_read_lock();
-
tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, key, params);
- rht_for_each_rcu(he, tbl, hash) {
- if (params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, he)) :
- rhashtable_compare(&arg, rht_obj(ht, he)))
- continue;
- rcu_read_unlock();
- return rht_obj(ht, he);
- }
+ bkt = rht_bucket(tbl, hash);
+ do {
+ rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+ if (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+ rhashtable_compare(&arg, rht_obj(ht, he)))
+ continue;
+ return he;
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
@@ -286,150 +607,594 @@ restart:
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
- rcu_read_unlock();
return NULL;
}
-static inline int __rhashtable_insert_fast(
- struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+ return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ void *obj;
+
+ rcu_read_lock();
+ obj = rhashtable_lookup(ht, key, params);
+ rcu_read_unlock();
+
+ return obj;
+}
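
A hedged lookup sketch, reusing the hypothetical demo_obj/demo_params from the rhashtable-types.h example above:

static bool demo_contains(struct rhashtable *ht, u64 key)
{
	bool found;

	rcu_read_lock();
	found = rhashtable_lookup(ht, &key, demo_params) != NULL;
	rcu_read_unlock();

	return found;
}

For a plain presence check like this, rhashtable_lookup_fast() would bundle the rcu_read_lock()/rcu_read_unlock() pair itself.
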
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+ struct rhltable *hlt, const void *key,
const struct rhashtable_params params)
{
+ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+ return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash if there is a
+ * clash, otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
- struct bucket_table *tbl, *new_tbl;
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct bucket_table *tbl;
struct rhash_head *head;
- spinlock_t *lock;
- unsigned int elasticity;
unsigned int hash;
- int err;
+ int elasticity;
+ void *data;
-restart:
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ elasticity = RHT_ELASTICITY;
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ data = ERR_PTR(-ENOMEM);
+ if (!bkt)
+ goto out;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
+ if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+ rht_unlock(tbl, bkt);
+ rcu_read_unlock();
+ return rhashtable_insert_slow(ht, key, obj);
+ }
- if (tbl->rehash <= hash)
- break;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *plist;
+ struct rhlist_head *list;
- spin_unlock_bh(lock);
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- }
+ elasticity--;
+ if (!key ||
+ (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- if (unlikely(new_tbl)) {
- tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
- if (!IS_ERR_OR_NULL(tbl))
- goto slow_path;
+ data = rht_obj(ht, head);
- err = PTR_ERR(tbl);
- goto out;
- }
+ if (!rhlist)
+ goto out_unlock;
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto out;
- if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
- spin_unlock_bh(lock);
- err = rhashtable_insert_rehash(ht, tbl);
- rcu_read_unlock();
- if (err)
- return err;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- goto restart;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else
+ rht_assign_unlock(tbl, bkt, obj);
+ data = NULL;
+ goto out;
}
- err = -EEXIST;
- elasticity = ht->elasticity;
- rht_for_each(head, tbl, hash) {
- if (key &&
- unlikely(!(params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, head)) :
- rhashtable_compare(&arg, rht_obj(ht, head)))))
- goto out;
- if (!--elasticity)
- goto slow_path;
- }
+ if (elasticity <= 0)
+ goto slow_path;
+
+ data = ERR_PTR(-E2BIG);
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto out_unlock;
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ goto slow_path;
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ /* Inserting at head of list makes unlocking free. */
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
atomic_inc(&ht->nelems);
+ rht_assign_unlock(tbl, bkt, obj);
+
if (rht_grow_above_75(ht, tbl))
schedule_work(&ht->run_work);
+ data = NULL;
out:
- spin_unlock_bh(lock);
rcu_read_unlock();
- return err;
+ return data;
+
+out_unlock:
+ rht_unlock(tbl, bkt);
+ goto out;
}
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
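
/*
 * A minimal usage sketch, not part of this header: the names test_obj,
 * test_params and test_insert are illustrative only.  The object embeds a
 * struct rhash_head and is keyed by a u32 stored directly in the object
 * (offsetof() comes from the usual kernel headers).  Note that with no key
 * passed, rhashtable_insert_fast() does not check for duplicates; see
 * rhashtable_lookup_insert_fast() below for that.
 */
struct test_obj {
	u32			id;
	struct rhash_head	node;
};

static const struct rhashtable_params test_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct test_obj, id),
	.head_offset	= offsetof(struct test_obj, node),
	.automatic_shrinking = true,
};

static inline int test_insert(struct rhashtable *ht, struct test_obj *obj)
{
	/* 0 on success, or a -errno if the insert could not be performed: */
	return rhashtable_insert_fast(ht, &obj->node, test_params);
}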
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt: hash list table
+ * @key: the pointer to the key
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+ struct rhltable *hlt, const void *key, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+ params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+ key += params.key_offset;
+
+ return rhltable_insert_key(hlt, key, list, params);
+}
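
/*
 * Sketch of the duplicate-key (rhltable) variant, with hypothetical names
 * test_dup, test_dup_params, test_dup_add: the object embeds a
 * struct rhlist_head, and duplicate ids are chained on a single bucket
 * entry rather than rejected.  Removal goes through rhltable_remove(),
 * defined further down in this header.
 */
struct test_dup {
	u32			id;
	struct rhlist_head	list;
};

static const struct rhashtable_params test_dup_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct test_dup, id),
	.head_offset	= offsetof(struct test_dup, list),
};

static inline int test_dup_add(struct rhltable *hlt, struct test_dup *d)
{
	return rhltable_insert(hlt, &d->list, test_dup_params);
}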
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * This lookup function may only be used for a fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
+ void *ret;
BUG_ON(ht->p.obj_hashfn);
- return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
- params);
+ ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
}
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(ht, obj);
+
+ BUG_ON(ht->p.obj_hashfn);
+
+ return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+}
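
/*
 * Sketch of how a caller can distinguish the three outcomes, reusing the
 * hypothetical test_obj/test_params from the sketch above (test_get_or_insert
 * is not part of this header):
 */
static inline struct test_obj *test_get_or_insert(struct rhashtable *ht,
						  struct test_obj *new_obj)
{
	void *old;

	old = rhashtable_lookup_get_insert_fast(ht, &new_obj->node, test_params);
	if (IS_ERR(old))
		return ERR_CAST(old);	/* e.g. -ENOMEM; nothing was inserted */
	if (!old)
		return new_obj;		/* no existing entry; new_obj inserted */
	return old;			/* existing entry wins; new_obj not inserted */
}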
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ * with explicit key
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
struct rhashtable *ht, struct bucket_table *tbl,
- struct rhash_head *obj, const struct rhashtable_params params)
+ struct rhash_head *obj, const struct rhashtable_params params,
+ bool rhlist)
{
+ struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
- spinlock_t * lock;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- spin_lock_bh(lock);
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+
+ list = container_of(he, struct rhlist_head, rhead);
- pprev = &tbl->buckets[hash];
- rht_for_each(he, tbl, hash) {
if (he != obj) {
+ struct rhlist_head __rcu **lpprev;
+
pprev = &he->next;
- continue;
+
+ if (!rhlist)
+ continue;
+
+ do {
+ lpprev = &list->next;
+ list = rht_dereference_bucket(list->next,
+ tbl, hash);
+ } while (list && obj != &list->rhead);
+
+ if (!list)
+ continue;
+
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ RCU_INIT_POINTER(*lpprev, list);
+ err = 0;
+ break;
}
- rcu_assign_pointer(*pprev, obj->next);
+ obj = rht_dereference_bucket(obj->next, tbl, hash);
+ err = 1;
+
+ if (rhlist) {
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ if (list) {
+ RCU_INIT_POINTER(list->rhead.next, obj);
+ obj = &list->rhead;
+ err = 0;
+ }
+ }
+
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj);
+ }
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+unlocked:
+ if (err > 0) {
+ atomic_dec(&ht->nelems);
+ if (unlikely(ht->p.automatic_shrinking &&
+ rht_shrink_below_30(ht, tbl)))
+ schedule_work(&ht->run_work);
err = 0;
- break;
}
- spin_unlock_bh(lock);
+ return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+ * visible then that guarantees the entry to still be in
+ * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+ rhlist)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ rcu_read_unlock();
return err;
}
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
static inline int rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
+ return __rhashtable_remove_fast(ht, obj, params, false);
+}
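
/*
 * Removal sketch with a hypothetical test_obj_rcu: because concurrent lookups
 * may still hold RCU references to a removed object, it is normally freed
 * only after a grace period, e.g. via kfree_rcu() as in the kernel proper
 * (assuming that helper is available in this environment).
 */
struct test_obj_rcu {
	u32			id;
	struct rhash_head	node;
	struct rcu_head		rcu;
};

static inline void test_del(struct rhashtable *ht, struct test_obj_rcu *obj,
			    const struct rhashtable_params params)
{
	if (!rhashtable_remove_fast(ht, &obj->node, params))
		kfree_rcu(obj, rcu);	/* free only after readers are done */
}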
+
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj_old, struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ /* Minimally, the old and new objects must have the same hash
+ * (which should mean identifiers are the same).
+ */
+ hash = rht_head_hashfn(ht, tbl, obj_old, params);
+ if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+ return -EINVAL;
+
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+
+ pprev = NULL;
+ rht_lock(tbl, bkt);
+
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ if (he != obj_old) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(obj_new->next, obj_old->next);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj_new);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj_new);
+ }
+ err = 0;
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+
+unlocked:
+ return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht: hash table
+ * @obj_old: pointer to hash head inside object being replaced
+ * @obj_new: pointer to hash head inside object which is new
+ * @params: hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+ struct rhashtable *ht, struct rhash_head *obj_old,
+ struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
struct bucket_table *tbl;
int err;
@@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast(
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
- while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+ obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
- if (err)
- goto out;
-
- atomic_dec(&ht->nelems);
- if (unlikely(ht->p.automatic_shrinking &&
- rht_shrink_below_30(ht, tbl)))
- schedule_work(&ht->run_work);
-
-out:
rcu_read_unlock();
return err;
}
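
/*
 * Replace sketch, reusing the hypothetical test_obj/test_params from above
 * (test_replace is not part of this header): the replacement must hash to the
 * same bucket, which in practice means it carries the same key.
 */
static inline int test_replace(struct rhashtable *ht, struct test_obj *old_obj,
			       struct test_obj *new_obj)
{
	new_obj->id = old_obj->id;	/* same key, hence same hash, as required */
	return rhashtable_replace_fast(ht, &old_obj->node, &new_obj->node,
				       test_params);
}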
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+ struct rhashtable_iter *iter)
+{
+ return rhashtable_walk_enter(&hlt->ht, iter);
+}
+
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt: the hash list table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+ void (*free_fn)(void *ptr,
+ void *arg),
+ void *arg)
+{
+ return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+ return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
#endif /* _LINUX_RHASHTABLE_H */
diff --git a/include/linux/six.h b/include/linux/six.h
index 0e6df059..477c33eb 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -196,6 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
+void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b8a1235b..775b7e3a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
#define kzfree(p) free(p)
#define kvmalloc(size, flags) kmalloc(size, flags)
+#define kvzalloc(size, flags) kzalloc(size, flags)
#define kvfree(p) kfree(p)
static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
diff --git a/include/linux/types.h b/include/linux/types.h
index 1e125550..c9886cba 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -11,6 +11,8 @@
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#include <asm/types.h>
+#include <linux/cache.h>
+
#define BITS_PER_LONG __BITS_PER_LONG
struct page;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 532f23b9..cb225951 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -138,19 +138,18 @@ struct bpos {
#define KEY_SNAPSHOT_MAX ((__u32)~0U)
#define KEY_SIZE_MAX ((__u32)~0U)
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
- struct bpos ret;
-
- ret.inode = inode;
- ret.offset = offset;
- ret.snapshot = 0;
-
- return ret;
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
}
-#define POS_MIN POS(0, 0)
-#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
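
/*
 * Illustrative only (example_pos is not part of this header): POS() keeps the
 * old two-argument form by defaulting the new snapshot field to zero, while
 * snapshot-aware callers pass it explicitly via SPOS():
 */
static inline struct bpos example_pos(__u64 inode, __u64 offset, int snapshots)
{
	return snapshots
		? SPOS(inode, offset, KEY_SNAPSHOT_MAX)
		: POS(inode, offset);	/* i.e. SPOS(inode, offset, 0) */
}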
/* Empty placeholder struct, for container_of() */
struct bch_val {
@@ -707,7 +706,9 @@ struct bch_inode_generation {
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
- x(bi_fields_set, 16)
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -743,6 +744,7 @@ enum {
__BCH_INODE_I_SIZE_DIRTY = 5,
__BCH_INODE_I_SECTORS_DIRTY = 6,
__BCH_INODE_UNLINKED = 7,
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
/* bits 20+ reserved for packed fields below: */
};
@@ -755,6 +757,7 @@ enum {
#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
@@ -1204,7 +1207,9 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_new_versioning = 10,
bcachefs_metadata_version_bkey_renumber = 10,
bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_max = 12,
+ bcachefs_metadata_version_snapshot = 12,
+ bcachefs_metadata_version_inode_backpointers = 13,
+ bcachefs_metadata_version_max = 14,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -1736,7 +1741,7 @@ struct btree_node {
/* Closed interval: */
struct bpos min_key;
struct bpos max_key;
- struct bch_extent_ptr ptr;
+ struct bch_extent_ptr _ptr; /* not used anymore */
struct bkey_format format;
union {
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index e1906f25..3af56062 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
return "incorrect number of fields";
for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > 64)
+ if (f->bits_per_field[i] > unpacked_bits)
return "field too large";
- if (field_offset &&
- (f->bits_per_field[i] == 64 ||
- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
- field_offset)))
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return "offset + bits overflow";
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
return "offset + bits overflow";
bits += f->bits_per_field[i];
@@ -1045,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
high_word(f, r),
b->nr_key_bits);
- EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
bkey_unpack_pos(b, r)));
return ret;
}
@@ -1055,7 +1059,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
const struct bkey_packed *l,
const struct bpos *r)
{
- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
}
__pure __flatten
@@ -1076,7 +1080,7 @@ int bch2_bkey_cmp_packed(const struct btree *b,
r = (void*) &unpacked;
}
- return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
__pure __flatten
@@ -1087,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b,
const struct bkey *l_unpacked;
return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bkey_cmp(l_unpacked->p, *r)
+ ? bpos_cmp(l_unpacked->p, *r)
: __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
@@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void)
struct bkey_packed p;
struct bkey_format test_format = {
- .key_u64s = 2,
+ .key_u64s = 3,
.nr_fields = BKEY_NR_FIELDS,
.bits_per_field = {
13,
64,
+ 32,
},
};
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 629288a6..2e45d88f 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -33,16 +33,6 @@ struct bkey_s {
#define bkey_next(_k) vstruct_next(_k)
-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
- struct bkey_packed *end)
-{
- k = bkey_next(k);
-
- while (k != end && !k->u64s)
- k = (void *) ((u64 *) k + 1);
- return k;
-}
-
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -150,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
return bkey_cmp_left_packed(b, l, &r);
}
-#if 1
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
+}
+
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
- if (l.inode != r.inode)
- return l.inode < r.inode ? -1 : 1;
- if (l.offset != r.offset)
- return l.offset < r.offset ? -1 : 1;
- if (l.snapshot != r.snapshot)
- return l.snapshot < r.snapshot ? -1 : 1;
- return 0;
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
}
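
/*
 * Hypothetical illustration of the split made here: bpos_cmp() is a total
 * order over all three fields, while bkey_cmp() ignores the snapshot field.
 */
static inline void bpos_cmp_example(void)
{
	struct bpos a = SPOS(1, 10, 0);
	struct bpos b = SPOS(1, 10, 5);

	BUG_ON(bkey_cmp(a, b));		/* equal when snapshots are ignored */
	BUG_ON(bpos_cmp(a, b) >= 0);	/* but a sorts strictly before b */
}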
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) < 0 ? l : r;
+ return bpos_cmp(l, r) < 0 ? l : r;
}
static inline struct bpos bpos_max(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) > 0 ? l : r;
+ return bpos_cmp(l, r) > 0 ? l : r;
}
#define sbb(a, b, borrow) \
@@ -200,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
{
- if (bkey_cmp(l, r) > 0)
+ if (bpos_cmp(l, r) > 0)
swap(l, r);
return bpos_sub(r, l);
@@ -262,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
}
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
{
- struct bpos ret = p;
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
- if (!++ret.offset)
- BUG_ON(!++ret.inode);
+ return p;
+}
- return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
{
- struct bpos ret = p;
+ p.snapshot = 0;
- if (!ret.offset--)
- BUG_ON(!ret.inode--);
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
- return ret;
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
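
/*
 * Hypothetical illustration of the carry behaviour: overflowing .snapshot
 * carries into .offset, and overflowing .offset carries into .inode.
 */
static inline void bpos_successor_example(void)
{
	struct bpos p = bpos_successor(SPOS(1, 2, KEY_SNAPSHOT_MAX));

	BUG_ON(bpos_cmp(p, SPOS(1, 3, 0)));	/* snapshot wrapped to 0, offset bumped */
}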
static inline u64 bkey_start_offset(const struct bkey *k)
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 641169ef..6fe95b80 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -119,10 +119,17 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
return "nonzero size field";
}
- if (k.k->p.snapshot)
+ if (type != BKEY_TYPE_btree &&
+ !btree_type_has_snapshots(type) &&
+ k.k->p.snapshot)
return "nonzero snapshot";
if (type != BKEY_TYPE_btree &&
+ btree_type_has_snapshots(type) &&
+ k.k->p.snapshot != U32_MAX)
+ return "invalid snapshot field";
+
+ if (type != BKEY_TYPE_btree &&
!bkey_cmp(k.k->p, POS_MAX))
return "POS_MAX key";
@@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
- if (bkey_cmp(k.k->p, b->data->min_key) < 0)
+ if (bpos_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
- if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+ if (bpos_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
return NULL;
@@ -165,9 +172,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
- if (!bkey_cmp(pos, POS_MIN))
+ if (!bpos_cmp(pos, POS_MIN))
pr_buf(out, "POS_MIN");
- else if (!bkey_cmp(pos, POS_MAX))
+ else if (!bpos_cmp(pos, POS_MAX))
pr_buf(out, "POS_MAX");
else {
if (pos.inode == U64_MAX)
@@ -256,7 +263,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
!ops->key_merge ||
l.k->type != r.k->type ||
bversion_cmp(l.k->version, r.k->version) ||
- bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+ bpos_cmp(l.k->p, bkey_start_pos(r.k)))
return BCH_MERGE_NOMERGE;
ret = ops->key_merge(c, l, r);
@@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
const struct bkey_ops *ops;
struct bkey uk;
struct bkey_s u;
+ unsigned nr_compat = 5;
int i;
/*
* Do these operations in reverse order in the write path:
*/
- for (i = 0; i < 4; i++)
- switch (!write ? i : 3 - i) {
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
case 0:
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_key(f, k);
@@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
}
break;
case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
+
+ break;
+ case 4:
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index f2507079..537ab791 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(!iter->used);
- i->k = bkey_next_skip_noops(i->k, i->end);
+ i->k = bkey_next(i->k);
BUG_ON(i->k > i->end);
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 87f951e1..3fb9a9ed 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
for (_k = i->start;
_k < vstruct_last(i);
_k = _n) {
- _n = bkey_next_skip_noops(_k, vstruct_last(i));
+ _n = bkey_next(_k);
k = bkey_disassemble(b, _k, &uk);
if (c)
@@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
n = bkey_unpack_key(b, _n);
- if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) {
+ if (bpos_cmp(n.p, k.k->p) < 0) {
printk(KERN_ERR "Key skipped backwards\n");
continue;
}
if (!bkey_deleted(k.k) &&
- !bkey_cmp(n.p, k.k->p))
+ !bpos_cmp(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
}
@@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
goto start;
while (1) {
if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
bkey_unpack_pos(b, k)));
start:
if (++j == t->size)
@@ -544,7 +544,7 @@ start:
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ k = bkey_next(k);
BUG_ON(k >= btree_bkey_last(b, t));
}
}
@@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (is_power_of_2(j) &&
!min_key->u64s) {
- k = (void *) min_key;
- bkey_init(&k->k);
- k->k.p = b->data->min_key;
+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
+ k = (void *) min_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->min_key;
+ }
}
if (is_power_of_2(j + 1) &&
!max_key->u64s) {
- k = (void *) max_key;
- bkey_init(&k->k);
- k->k.p = t->max_key;
+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
+ k = (void *) max_key;
+ bkey_init(&k->k);
+ k->k.p = t->max_key;
+ }
}
__make_bfloat(b, t, j, min_key, max_key);
@@ -759,7 +763,7 @@ retry:
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -776,14 +780,19 @@ retry:
}
while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
t->max_key = bkey_unpack_pos(b, prev);
- bkey_init(&min_key.k);
- min_key.k.p = b->data->min_key;
- bkey_init(&max_key.k);
- max_key.k.p = t->max_key;
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
+
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = t->max_key;
+ }
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
@@ -911,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next_skip_noops(i, k))
+ for (i = p; i != k; i = bkey_next(i))
if (i->type >= min_key_type)
ret = i;
@@ -922,10 +931,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
BUG_ON(ret >= orig_k);
for (i = ret
- ? bkey_next_skip_noops(ret, orig_k)
+ ? bkey_next(ret)
: btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next_skip_noops(i, orig_k))
+ i = bkey_next(i))
BUG_ON(i->type >= min_key_type);
}
@@ -960,7 +969,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
/* signal to make_bfloat() that they're uninitialized: */
min_key.u64s = max_key.u64s = 0;
- if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
t->max_key = bkey_unpack_pos(b, k);
for (j = 1; j < t->size; j = j * 2 + 1)
@@ -1084,7 +1093,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
struct bkey_packed *k = start;
while (1) {
- k = bkey_next_skip_noops(k, end);
+ k = bkey_next(k);
if (k == end)
break;
@@ -1170,15 +1179,14 @@ void bch2_bset_delete(struct btree *b,
__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *packed_search)
+ struct bpos *search)
{
unsigned l = 0, r = t->size;
while (l + 1 != r) {
unsigned m = (l + r) >> 1;
- if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
l = m;
else
r = m;
@@ -1238,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
prefetch(&base->f[n << 4]);
f = &base->f[n];
-
- if (!unlikely(packed_search))
- goto slowpath;
if (unlikely(f->exponent >= BFLOAT_FAILED))
goto slowpath;
@@ -1304,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
case BSET_NO_AUX_TREE:
return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search, lossy_packed_search);
+ return bset_search_write_set(b, t, search);
case BSET_RO_AUX_TREE:
/*
* Each node in the auxiliary search tree covers a certain range
@@ -1313,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
* start and end - handle that here:
*/
- if (bkey_cmp(*search, t->max_key) > 0)
+ if (bpos_cmp(*search, t->max_key) > 0)
return btree_bkey_last(b, t);
return bset_search_tree(b, t, search, lossy_packed_search);
@@ -1334,12 +1339,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, m,
lossy_packed_search, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1403,16 +1408,15 @@ noinline __flatten __attribute__((cold))
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
+ struct bkey_packed *k;
trace_bkey_pack_pos_fail(search);
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search, NULL, NULL),
- btree_bkey_last(b, t));
+ bch2_btree_node_iter_init_from_start(iter, b);
- bch2_btree_node_iter_sort(iter, b);
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
}
/**
@@ -1446,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
* to the search key is going to have 0 sectors after the search key.
*
* But this does mean that we can't just search for
- * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
* the range we want - if we're unlucky and there's an extent that ends
* exactly where we searched, then there could be a deleted key at the same
* position and we'd get that when we search instead of the preceding extent
@@ -1464,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct bkey_packed *k[MAX_BSETS];
unsigned i;
- EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
bset_aux_tree_verify(b);
memset(iter, 0, sizeof(*iter));
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 54b364c8..506da4e0 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
_k != btree_bkey_last(_b, _t); \
- _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+ _k = bkey_next(_k))
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
@@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
EBUG_ON(r_packed && !bkey_packed(r_packed));
if (unlikely(!bkey_packed(l)))
- return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
if (likely(r_packed))
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
@@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
return bch2_bkey_prev_filter(b, t, k, 1);
}
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
-
- return (cmp1 << 1) + cmp2;
-}
-
/* Btree key iteration */
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index fc76e788..8a4667ba 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -149,7 +149,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
if (level)
six_lock_pcpu_alloc(&b->c.lock);
else
- six_lock_pcpu_free(&b->c.lock);
+ six_lock_pcpu_free_rcu(&b->c.lock);
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
@@ -814,9 +814,9 @@ lock_node:
EBUG_ON(b->c.btree_id != iter->btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- bkey_cmp(b->data->min_key,
+ bpos_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
@@ -897,9 +897,9 @@ lock_node:
EBUG_ON(b->c.btree_id != btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- bkey_cmp(b->data->min_key,
+ bpos_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
bch2_btree_cache_cannibalize_unlock(c);
@@ -1011,7 +1011,7 @@ out:
if (sib != btree_prev_sib)
swap(n1, n2);
- if (bkey_cmp(bkey_successor(n1->key.k.p),
+ if (bpos_cmp(bpos_successor(n1->key.k.p),
n2->data->min_key)) {
char buf1[200], buf2[200];
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6d5ed774..88c549c4 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
struct bpos node_end = b->data->max_key;
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
- : bkey_successor(prev->k->k.p);
+ : bpos_successor(prev->k->k.p);
char buf1[200], buf2[200];
bool update_min = false;
bool update_max = false;
@@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
}
- if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
+ if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" cur %s",
@@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
}
if (fsck_err_on(is_last &&
- bkey_cmp(cur.k->k.p, node_end), c,
+ bpos_cmp(cur.k->k.p, node_end), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
@@ -470,8 +470,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
k, &max_stale, true);
@@ -560,13 +560,13 @@ static int bch2_gc_btree_init(struct bch_fs *c,
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
- if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
+ if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
"btree root with incorrect min_key: %s",
(bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
BUG();
}
- if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
+ if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
"btree root with incorrect max_key: %s",
(bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
BUG();
@@ -1148,7 +1148,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
@@ -1171,6 +1173,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_btree_iter_advance(iter);
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@@ -1271,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
/* Find a format that all keys in @old_nodes can pack into */
bch2_bkey_format_init(&format_state);
+ /*
+ * XXX: this won't correctly take into account the new min/max keys:
+ */
for (i = 0; i < nr_old_nodes; i++)
__bch2_btree_calc_format(&format_state, old_nodes[i]);
@@ -1333,7 +1339,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
k < vstruct_last(s2) &&
vstruct_blocks_plus(n1->data, c->block_bits,
u64s + k->u64s) <= blocks;
- k = bkey_next_skip_noops(k, vstruct_last(s2))) {
+ k = bkey_next(k)) {
last = k;
u64s += k->u64s;
}
@@ -1362,7 +1368,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
n1->key.k.p = n1->data->max_key =
bkey_unpack_pos(n1, last);
- n2->data->min_key = bkey_successor(n1->data->max_key);
+ n2->data->min_key = bpos_successor(n1->data->max_key);
memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
@@ -1405,7 +1411,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
unsigned j;
for (j = 0; j < nr_new_nodes; j++)
- if (!bkey_cmp(old_nodes[i]->key.k.p,
+ if (!bpos_cmp(old_nodes[i]->key.k.p,
new_nodes[j]->key.k.p))
goto next;
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index c3d02f58..b1362a9f 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- if (l.phase != r.phase)
- return l.phase < r.phase ? -1 : 1;
- if (bkey_cmp(l.pos, r.pos))
- return bkey_cmp(l.pos, r.pos);
- if (l.level != r.level)
- return l.level < r.level ? -1 : 1;
- return 0;
+ return cmp_int(l.phase, r.phase) ?:
+ bpos_cmp(l.pos, r.pos) ?:
+ cmp_int(l.level, r.level);
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 9b74e799..b43d4468 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -32,13 +32,13 @@ static void verify_no_dups(struct btree *b,
if (start == end)
return;
- for (p = start, k = bkey_next_skip_noops(start, end);
+ for (p = start, k = bkey_next(start);
k != end;
- p = k, k = bkey_next_skip_noops(k, end)) {
+ p = k, k = bkey_next(k)) {
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
}
#endif
}
@@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v)
{
struct bkey_packed *k;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i)))
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
k->needs_whiteout = v;
}
@@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next_skip_noops(k, end);
+ n = bkey_next(k);
if (!bkey_deleted(k)) {
bkey_copy(out, k);
@@ -614,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect level");
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &bn->ptr;
-
- *p = swab64(*p);
- }
-
if (!write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
@@ -633,14 +625,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
b->data->max_key = b->key.k.p;
}
- btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
+ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %s should be %s",
(bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
(bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
}
- btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
+ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %s",
(bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
@@ -754,7 +746,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
}
prev = k;
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
fsck_err:
return ret;
@@ -947,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bp.v->mem_ptr = 0;
}
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
@@ -1327,8 +1319,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
return -1;
- ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?:
- validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+ validate_bset(c, NULL, b, i, sectors, WRITE, false);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -1481,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
validate_before_checksum = true;
/* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change)
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
/* if we're going to be encrypting, check metadata validity first: */
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 16ce6dff..9c14cd30 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_id == BTREE_ID_inodes) {
@@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id,
swap(f->field_offset[BKEY_FIELD_INODE],
f->field_offset[BKEY_FIELD_OFFSET]);
}
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : U32_MAX - max_packed;
+ }
}
static inline void compat_bpos(unsigned level, enum btree_id btree_id,
@@ -220,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
write)
- bn->min_key = bkey_predecessor(bn->min_key);
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
!write)
- bn->min_key = bkey_successor(bn->min_key);
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
}
#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 459d27ca..8190e73d 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -18,6 +18,36 @@
static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
@@ -30,20 +60,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
bkey_cmp(pos, POS_MAX))
- pos = bkey_successor(pos);
+ pos = bkey_successor(iter, pos);
return pos;
}
static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(iter->real_pos, b->data->min_key) < 0;
+ return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
}
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(b->key.k.p, iter->real_pos) < 0;
+ return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -285,7 +315,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
- bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
+ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
btree_iter_type(linked))) <= 0) {
deadlock_iter = linked;
reason = 7;
@@ -583,10 +613,24 @@ err:
static void bch2_btree_iter_verify(struct btree_iter *iter)
{
+ enum btree_iter_type type = btree_iter_type(iter);
unsigned i;
EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type == BTREE_ITER_NODES &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type != BTREE_ITER_NODES &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshots(iter->btree_id));
+
bch2_btree_iter_verify_locks(iter);
for (i = 0; i < BTREE_MAX_DEPTH; i++)
@@ -597,6 +641,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
enum btree_iter_type type = btree_iter_type(iter);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
BUG_ON((type == BTREE_ITER_KEYS ||
type == BTREE_ITER_CACHED) &&
(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
@@ -1384,7 +1431,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
if (!b)
return NULL;
- BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
iter->pos = iter->real_pos = b->key.k.p;
@@ -1421,12 +1468,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (!b)
return NULL;
- if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+ if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- btree_iter_set_search_pos(iter, bkey_successor(iter->pos));
+ btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
/* Unlock to avoid screwing up our lock invariants: */
btree_node_unlock(iter, iter->level);
@@ -1453,7 +1500,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
{
- int cmp = bkey_cmp(new_pos, iter->real_pos);
+ int cmp = bpos_cmp(new_pos, iter->real_pos);
unsigned l = iter->level;
if (!cmp)
@@ -1497,10 +1544,10 @@ out:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = bkey_cmp(pos, POS_MAX) != 0;
+ bool ret = bpos_cmp(pos, POS_MAX) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(pos);
+ pos = bkey_successor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
}
@@ -1508,10 +1555,10 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = bkey_cmp(pos, POS_MIN) != 0;
+ bool ret = bpos_cmp(pos, POS_MIN) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_predecessor(pos);
+ pos = bkey_predecessor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
}
@@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
struct bpos next_pos = iter->l[0].b->key.k.p;
- bool ret = bkey_cmp(next_pos, POS_MAX) != 0;
+ bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
/*
* Typically, we don't want to modify iter->pos here, since that
@@ -1527,7 +1574,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
* btree, in that case we want iter->pos to reflect that:
*/
if (ret)
- btree_iter_set_search_pos(iter, bkey_successor(next_pos));
+ btree_iter_set_search_pos(iter, bpos_successor(next_pos));
else
bch2_btree_iter_set_pos(iter, POS_MAX);
@@ -1537,10 +1584,10 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
{
struct bpos next_pos = iter->l[0].b->data->min_key;
- bool ret = bkey_cmp(next_pos, POS_MIN) != 0;
+ bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
if (ret)
- btree_iter_set_search_pos(iter, bkey_predecessor(next_pos));
+ btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
else
bch2_btree_iter_set_pos(iter, POS_MIN);
@@ -1586,13 +1633,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi
k = btree_iter_level_peek(iter, &iter->l[0]);
if (next_update &&
- bkey_cmp(next_update->k.p, iter->real_pos) <= 0)
+ bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
k = bkey_i_to_s_c(next_update);
if (likely(k.k)) {
if (bkey_deleted(k.k)) {
btree_iter_set_search_pos(iter,
- bkey_successor(k.k->p));
+ bkey_successor(iter, k.k->p));
continue;
}
@@ -1731,7 +1778,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
+ bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
}
pos = iter->pos;
@@ -1965,6 +2012,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
{
struct btree_iter *iter, *best = NULL;
+ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ !btree_type_has_snapshots(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ pos.snapshot = btree_type_has_snapshots(btree_id)
+ ? U32_MAX : 0;
+
/* We always want a fresh iterator for node iterators: */
if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES)
goto alloc_iter;
@@ -1999,11 +2054,14 @@ alloc_iter:
if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
btree_node_type_is_extents(btree_id) &&
- !(flags & BTREE_ITER_NOT_EXTENTS))
+ !(flags & BTREE_ITER_NOT_EXTENTS) &&
+ !(flags & BTREE_ITER_ALL_SNAPSHOTS))
flags |= BTREE_ITER_IS_EXTENTS;
iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+
if (!(iter->flags & BTREE_ITER_INTENT))
bch2_btree_iter_downgrade(iter);
else if (!iter->locks_want)
@@ -2026,6 +2084,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
__bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_NODES|
BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
flags);
unsigned i;
@@ -2127,6 +2186,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->nr_updates2 = 0;
trans->mem_top = 0;
+ trans->hooks = NULL;
trans->extra_journal_entries = NULL;
trans->extra_journal_entry_u64s = 0;
@@ -2137,7 +2197,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
(void *) &trans->fs_usage_deltas->memset_start);
}
- bch2_trans_cond_resched(trans);
+ if (!(flags & TRANS_RESET_NOUNLOCK))
+ bch2_trans_cond_resched(trans);
if (!(flags & TRANS_RESET_NOTRAVERSE))
bch2_btree_iter_traverse_all(trans);
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 8768f4cb..7585f989 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *);
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
bkey_init(&iter->k);
iter->k.p = iter->pos = new_pos;
}
@@ -303,6 +306,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr
}
#define TRANS_RESET_NOTRAVERSE (1 << 0)
+#define TRANS_RESET_NOUNLOCK (1 << 1)
void bch2_trans_reset(struct btree_trans *, unsigned);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0b354563..04354f56 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const struct bkey_cached_key *key = arg->key;
return cmp_int(ck->key.btree_id, key->btree_id) ?:
- bkey_cmp(ck->key.pos, key->pos);
+ bpos_cmp(ck->key.pos, key->pos);
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
@@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
- c->nr_keys--;
+ atomic_long_dec(&c->nr_keys);
}
static void bkey_cached_free(struct btree_key_cache *bc,
@@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
- list_for_each_entry_reverse(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck)) {
- c->nr_freed--;
- return ck;
- }
-
ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
if (likely(ck)) {
INIT_LIST_HEAD(&ck->list);
@@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c)
return ck;
}
- list_for_each_entry(ck, &c->clean, list)
+ return NULL;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ mutex_lock(&c->lock);
+ list_for_each_entry_reverse(ck, &c->freed, list)
if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
+ c->nr_freed--;
+ list_del(&ck->list);
+ mutex_unlock(&c->lock);
return ck;
}
+ mutex_unlock(&c->lock);
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(c, ck);
+ rcu_read_unlock();
+ return ck;
+ }
+ }
+ rcu_read_unlock();
return NULL;
}
@@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c,
struct bpos pos)
{
struct bkey_cached *ck;
+ bool was_new = true;
ck = bkey_cached_alloc(c);
- if (!ck)
- return ERR_PTR(-ENOMEM);
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(c);
+ if (unlikely(!ck))
+ return ERR_PTR(-ENOMEM);
+
+ was_new = false;
+ }
ck->c.level = 0;
ck->c.btree_id = btree_id;
@@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c,
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (rhashtable_lookup_insert_fast(&c->table,
+ if (unlikely(rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
- bch2_btree_key_cache_params)) {
+ bch2_btree_key_cache_params))) {
/* We raced with another fill: */
- bkey_cached_free(c, ck);
+
+ if (likely(was_new)) {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ kfree(ck);
+ } else {
+ mutex_lock(&c->lock);
+ bkey_cached_free(c, ck);
+ mutex_unlock(&c->lock);
+ }
+
return NULL;
}
- c->nr_keys++;
+ atomic_long_inc(&c->nr_keys);
- list_move(&ck->list, &c->clean);
six_unlock_write(&ck->c.lock);
return ck;
@@ -213,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
const struct btree_iter *iter = p;
return ck->key.btree_id == iter->btree_id &&
- !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+ !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
__flatten
@@ -238,11 +276,8 @@ retry:
return 0;
}
- mutex_lock(&c->btree_key_cache.lock);
ck = btree_key_cache_create(&c->btree_key_cache,
iter->btree_id, iter->pos);
- mutex_unlock(&c->btree_key_cache.lock);
-
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
@@ -257,7 +292,7 @@ retry:
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
bkey_cached_check_fn, iter, _THIS_IP_)) {
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
goto retry;
}
@@ -267,7 +302,7 @@ retry:
}
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
@@ -370,15 +405,13 @@ err:
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
+ BUG_ON(!btree_node_locked(c_iter, 0));
+
if (!evict) {
- mutex_lock(&c->btree_key_cache.lock);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty--;
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
-
- list_move_tail(&ck->list, &c->btree_key_cache.clean);
- mutex_unlock(&c->btree_key_cache.lock);
} else {
evict:
BUG_ON(!btree_node_intent_locked(c_iter, 0));
@@ -388,13 +421,14 @@ evict:
six_lock_write(&ck->c.lock, NULL, NULL);
- mutex_lock(&c->btree_key_cache.lock);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty--;
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
bkey_cached_evict(&c->btree_key_cache, ck);
+
+ mutex_lock(&c->btree_key_cache.lock);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
}
@@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- mutex_lock(&c->btree_key_cache.lock);
- list_move(&ck->list, &c->btree_key_cache.dirty);
-
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty++;
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
if (bch2_nr_btree_keys_need_flush(c))
kick_reclaim = true;
-
- mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
@@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
struct bkey_cached *ck, *t;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned flags;
+ unsigned start, flags;
+ int srcu_idx;
/* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_FS)
@@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
else if (!mutex_trylock(&bc->lock))
return -1;
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
/*
@@ -540,23 +572,40 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
if (scanned >= nr)
goto out;
- list_for_each_entry_safe(ck, t, &bc->clean, list) {
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(bc, ck);
- bkey_cached_free(bc, ck);
- }
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ start = bc->shrink_iter;
- scanned++;
- if (scanned >= nr) {
- if (&t->list != &bc->clean)
- list_move_tail(&bc->clean, &t->list);
- goto out;
+ do {
+ struct rhash_head *pos, *next;
+
+ rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ continue;
+
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ break;
}
- }
+
+ bc->shrink_iter++;
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ } while (scanned < nr && bc->shrink_iter != start);
+
+ rcu_read_unlock();
out:
memalloc_nofs_restore(flags);
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
return freed;
@@ -569,41 +618,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
btree_key_cache.shrink);
struct btree_key_cache *bc = &c->btree_key_cache;
- return bc->nr_keys;
+ return atomic_long_read(&bc->nr_keys);
}
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
struct bkey_cached *ck, *n;
+ struct rhash_head *pos;
+ unsigned i;
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
mutex_lock(&bc->lock);
- list_splice(&bc->dirty, &bc->clean);
- list_for_each_entry_safe(ck, n, &bc->clean, list) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &bc->freed);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(ck, n, &bc->freed, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
bch2_journal_preres_put(&c->journal, &ck->res);
- kfree(ck->k);
list_del(&ck->list);
+ kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
- bc->nr_keys--;
}
- BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
- BUG_ON(bc->nr_keys);
-
- list_for_each_entry_safe(ck, n, &bc->freed, list) {
- cond_resched();
+ BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+ BUG_ON(atomic_long_read(&bc->nr_keys));
- list_del(&ck->list);
- kmem_cache_free(bch2_key_cache, ck);
- }
mutex_unlock(&bc->lock);
if (bc->table_init_done)
@@ -614,8 +667,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
- INIT_LIST_HEAD(&c->clean);
- INIT_LIST_HEAD(&c->dirty);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
@@ -641,8 +692,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
- pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys);
- pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+ pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys));
+ pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
}
void bch2_btree_key_cache_exit(void)
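
The key cache counters nr_keys/nr_dirty move from mutex-protected size_t fields to atomic_long_t so they can be adjusted without taking btree_key_cache.lock. A minimal userspace sketch of the same pattern, using C11 atomics in place of the kernel's atomic_long_t (the struct and helper names here are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative cache with lock-free statistics counters: */
struct key_cache_stats {
	atomic_long nr_keys;
	atomic_long nr_dirty;
};

static void key_added(struct key_cache_stats *s)
{
	atomic_fetch_add_explicit(&s->nr_keys, 1, memory_order_relaxed);
}

static void key_evicted(struct key_cache_stats *s)
{
	atomic_fetch_sub_explicit(&s->nr_keys, 1, memory_order_relaxed);
}

static void key_dirtied(struct key_cache_stats *s)
{
	atomic_fetch_add_explicit(&s->nr_dirty, 1, memory_order_relaxed);
}

static void key_cleaned(struct key_cache_stats *s)
{
	atomic_fetch_sub_explicit(&s->nr_dirty, 1, memory_order_relaxed);
}

int main(void)
{
	struct key_cache_stats s = { 0 };

	key_added(&s); key_added(&s); key_dirtied(&s);
	key_cleaned(&s); key_evicted(&s);

	/* Readers only need a plain atomic load, no lock: */
	printf("nr_keys=%ld nr_dirty=%ld\n",
	       atomic_load_explicit(&s.nr_keys, memory_order_relaxed),
	       atomic_load_explicit(&s.nr_dirty, memory_order_relaxed));
	return 0;
}

In the patch the clean/dirty lists go away entirely; the shrinker and the exit path walk the rhashtable instead, so the mutex is left guarding only the freed list.
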
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index 2f8b5521..02715cd2 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -3,8 +3,8 @@
static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
{
- size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 1024 + nr_keys / 2;
return max_t(ssize_t, 0, nr_dirty - max_dirty);
@@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
{
- size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
return nr_dirty > max_dirty &&
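
The two helpers above encode the write-back throttling policy: flushing becomes necessary once nr_dirty exceeds 1024 + nr_keys/2, and writers must wait once it exceeds 4096 + 3*nr_keys/4 (the real must_wait helper chains one further condition that the hunk above truncates). A small standalone version of the same arithmetic with a worked example, using plain longs in place of atomic_long_t:

#include <stdio.h>

/* How many dirty keys are over the "start flushing" watermark. */
static long keys_need_flush(long nr_dirty, long nr_keys)
{
	long max_dirty = 1024 + nr_keys / 2;
	long over = nr_dirty - max_dirty;

	return over > 0 ? over : 0;
}

/* Whether writers should block until reclaim catches up. */
static int key_cache_must_wait(long nr_dirty, long nr_keys)
{
	long max_dirty = 4096 + (nr_keys * 3) / 4;

	return nr_dirty > max_dirty;
}

int main(void)
{
	/* 10000 cached keys, 7000 of them dirty: */
	printf("need_flush = %ld\n", keys_need_flush(7000, 10000)); /* 7000 - 6024 = 976 */
	printf("must_wait  = %d\n",  key_cache_must_wait(7000, 10000)); /* 7000 <= 11596 -> 0 */
	return 0;
}
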
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 5999044a..1941616f 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -216,6 +216,7 @@ enum btree_iter_type {
#define BTREE_ITER_CACHED_NOFILL (1 << 9)
#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
#define BTREE_ITER_NOT_EXTENTS (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -245,6 +246,8 @@ struct btree_iter {
/* what we're searching for/what the iterator actually points to: */
struct bpos real_pos;
struct bpos pos_after_commit;
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
u16 flags;
u8 idx;
@@ -292,13 +295,12 @@ struct btree_key_cache {
struct rhashtable table;
bool table_init_done;
struct list_head freed;
- struct list_head clean;
- struct list_head dirty;
struct shrinker shrink;
+ unsigned shrink_iter;
size_t nr_freed;
- size_t nr_keys;
- size_t nr_dirty;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
};
struct bkey_cached_key {
@@ -330,7 +332,7 @@ struct bkey_cached {
struct btree_insert_entry {
unsigned trigger_flags;
u8 bkey_type;
- u8 btree_id;
+ enum btree_id btree_id:8;
u8 level;
unsigned trans_triggers_run:1;
unsigned is_extent:1;
@@ -344,6 +346,14 @@ struct btree_insert_entry {
#define BTREE_ITER_MAX 32
#endif
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
struct btree_trans {
struct bch_fs *c;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -378,6 +388,7 @@ struct btree_trans {
struct btree_insert_entry *updates2;
/* update path: */
+ struct btree_trans_commit_hook *hooks;
struct jset_entry *extra_journal_entries;
unsigned extra_journal_entry_u64s;
struct journal_entry_pin *journal_pin;
@@ -600,6 +611,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+#define BTREE_ID_HAS_SNAPSHOTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_inodes)| \
+ (1U << BTREE_ID_dirents)| \
+ (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
+
enum btree_trigger_flags {
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index a2513808..4ce12ae2 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_trigger_flags);
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
/**
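
bch2_trans_commit_hook() pushes a hook onto a singly linked list hanging off the transaction, and bch2_trans_commit_write_locked() (next file) walks that list while the relevant nodes are write locked, aborting the commit if any hook returns an error. A self-contained sketch of that intrusive hook-list pattern, with invented names for the transaction and hook types:

#include <stdio.h>

struct trans;			/* stand-in for struct btree_trans */
struct commit_hook;

typedef int (commit_hook_fn)(struct trans *, struct commit_hook *);

struct commit_hook {
	commit_hook_fn		*fn;
	struct commit_hook	*next;
};

struct trans {
	struct commit_hook	*hooks;
	int			journal_seq;
};

/* Registration just pushes onto the head of the list (LIFO order): */
static void trans_commit_hook(struct trans *trans, struct commit_hook *h)
{
	h->next = trans->hooks;
	trans->hooks = h;
}

/* At commit time every hook runs once; any error aborts the commit: */
static int run_commit_hooks(struct trans *trans)
{
	struct commit_hook *h;
	int ret;

	for (h = trans->hooks; h; h = h->next) {
		ret = h->fn(trans, h);
		if (ret)
			return ret;
	}
	return 0;
}

static int log_hook(struct trans *trans, struct commit_hook *h)
{
	printf("hook %p ran at seq %d\n", (void *) h, trans->journal_seq);
	return 0;
}

int main(void)
{
	struct trans trans = { .hooks = NULL, .journal_seq = 7 };
	struct commit_hook a = { .fn = log_hook }, b = { .fn = log_hook };

	trans_commit_hook(&trans, &a);
	trans_commit_hook(&trans, &b);	/* b runs before a */
	return run_commit_hooks(&trans);
}

In real users the hook is typically embedded in a larger allocation and the callback recovers its container with container_of(); the sketch keeps the hooks standalone for brevity.
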
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index a661bc0c..19dfc32e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- if (bkey_cmp(next_node, bp.v->min_key)) {
+ if (bpos_cmp(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
panic("expected next min_key %s got %s\n",
(bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
@@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- if (bkey_cmp(k.k->p, b->key.k.p)) {
+ if (bpos_cmp(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
panic("expected end %s got %s\n",
(bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
@@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
}
- next_node = bkey_successor(k.k->p);
+ next_node = bpos_successor(k.k->p);
}
#endif
}
@@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
struct bset_tree *t;
struct bkey uk;
- bch2_bkey_format_add_pos(s, b->data->min_key);
-
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k)) {
@@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b)
struct bkey_format_state s;
bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
__bch2_btree_calc_format(&s, b);
return bch2_bkey_format_done(&s);
@@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
@@ -1095,10 +1094,12 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct btree *n1,
struct btree_iter *iter)
{
+ struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bset *set1, *set2;
- struct bkey_packed *k, *prev = NULL;
+ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+ struct bpos n1_pos;
n2 = bch2_btree_node_alloc(as, n1->c.level);
bch2_btree_update_add_new_node(as, n2);
@@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as,
SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
- btree_node_set_format(n2, n2->data->format);
-
set1 = btree_bset_first(n1);
set2 = btree_bset_first(n2);
@@ -1119,7 +1118,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
*/
k = set1->start;
while (1) {
- struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+ struct bkey_packed *n = bkey_next(k);
if (n == vstruct_last(set1))
break;
@@ -1136,33 +1135,53 @@ static struct btree *__btree_split_node(struct btree_update *as,
}
BUG_ON(!prev);
+ set2_start = k;
+ set2_end = vstruct_last(set1);
- btree_set_max(n1, bkey_unpack_pos(n1, prev));
- btree_set_min(n2, bkey_successor(n1->key.k.p));
-
- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
+ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
-
- n2->nr.live_u64s = le16_to_cpu(set2->u64s);
- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
n1->nr.live_u64s = le16_to_cpu(set1->u64s);
n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
n1->nr.packed_keys = nr_packed;
n1->nr.unpacked_keys = nr_unpacked;
+ n1_pos = bkey_unpack_pos(n1, prev);
+ if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+ n1_pos.snapshot = U32_MAX;
+
+ btree_set_max(n1, n1_pos);
+ btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+ bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, n2->data->min_key);
+ bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ struct bkey uk = bkey_unpack_key(n1, k);
+ bch2_bkey_format_add_key(&s, &uk);
+ }
+
+ n2->data->format = bch2_bkey_format_done(&s);
+ btree_node_set_format(n2, n2->data->format);
+
+ out = set2->start;
+ memset(&n2->nr, 0, sizeof(n2->nr));
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+ ? &n1->format : &bch2_bkey_format_current, k));
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ btree_keys_account_key_add(&n2->nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+ set_btree_bset_end(n2, n2->set);
+
BUG_ON(!set1->u64s);
BUG_ON(!set2->u64s);
- memcpy_u64s(set2->start,
- vstruct_end(set1),
- le16_to_cpu(set2->u64s));
-
btree_node_reset_sib_u64s(n1);
btree_node_reset_sib_u64s(n2);
@@ -1216,7 +1235,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
i = btree_bset_first(b);
src = dst = i->start;
while (src != vstruct_last(i)) {
- n = bkey_next_skip_noops(src, vstruct_last(i));
+ n = bkey_next(src);
if (!bkey_deleted(src)) {
memmove_u64s_down(dst, src, src->u64s);
dst = bkey_next(dst);
@@ -1563,8 +1582,10 @@ retry:
}
bch2_bkey_format_init(&new_s);
- __bch2_btree_calc_format(&new_s, b);
- __bch2_btree_calc_format(&new_s, m);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
new_f = bch2_bkey_format_done(&new_s);
sib_u64s = btree_node_u64s_with_format(b, &new_f) +
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index d9308bd4..67a2c65b 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
{
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ bpos_cmp(l->k->k.p, r->k->k.p);
}
static inline bool same_leaf_as_prev(struct btree_trans *trans,
@@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
- EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(iter->trans->c, b));
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- BUG_ON(bch2_debug_check_bkeys &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type));
- BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos));
+ if (bch2_debug_check_bkeys) {
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ panic("invalid bkey %s on insert: %s\n", buf, invalid);
+ }
+ }
+ BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
BUG_ON(i->level != i->iter->level);
BUG_ON(i->btree_id != i->iter->btree_id);
}
@@ -369,6 +377,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
+ struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
int ret;
@@ -386,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ return ret;
+ h = h->next;
+ }
+
trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
@@ -556,6 +573,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
+ if (!(trans->flags & BTREE_INSERT_NOUNLOCK))
trans_for_each_update2(trans, i)
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!same_leaf_as_prev(trans, i))
@@ -826,7 +844,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
bool trans_trigger_run;
- unsigned u64s;
+ unsigned u64s, reset_flags = 0;
int ret = 0;
if (!trans->nr_updates)
@@ -940,7 +958,11 @@ out:
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_reset:
- bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+ if (!ret)
+ reset_flags |= TRANS_RESET_NOTRAVERSE;
+ if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+ reset_flags |= TRANS_RESET_NOUNLOCK;
+ bch2_trans_reset(trans, reset_flags);
return ret;
err:
@@ -1053,6 +1075,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
return 0;
}
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
int __bch2_btree_insert(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
{
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index c6d49f44..acf60038 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+ iter = bch2_trans_get_iter(&trans, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek(iter);
while (k.k && !(err = bkey_err(k))) {
@@ -273,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
if (err)
return err;
- if (!i->size || !bkey_cmp(POS_MAX, i->from))
+ if (!i->size || !bpos_cmp(POS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
@@ -289,8 +291,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
* can't easily correctly restart a btree node traversal across
* all nodes, meh
*/
- i->from = bkey_cmp(POS_MAX, b->key.k.p)
- ? bkey_successor(b->key.k.p)
+ i->from = bpos_cmp(POS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
: b->key.k.p;
if (!i->size)
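
Much of this patch swaps bkey_cmp()/bkey_successor() for bpos_cmp()/bpos_successor() where whole btree positions, snapshot field included, must be ordered and stepped. A rough sketch of the distinction, under the assumption that the snapshot-aware comparison orders by inode, then offset, then snapshot, and that the successor increments the lowest field with carry; the names mirror the patch but the implementations are simplified stand-ins.

#include <stdint.h>
#include <stdio.h>

struct bpos { uint64_t inode, offset; uint32_t snapshot; };

static int cmp_u64(uint64_t l, uint64_t r) { return (l > r) - (l < r); }

/* Ignores the snapshot field (key ordering within one snapshot): */
static int bkey_cmp_sketch(struct bpos l, struct bpos r)
{
	int c = cmp_u64(l.inode, r.inode);
	return c ? c : cmp_u64(l.offset, r.offset);
}

/* Total order over full positions, snapshot included: */
static int bpos_cmp_sketch(struct bpos l, struct bpos r)
{
	int c = cmp_u64(l.inode, r.inode);
	if (!c) c = cmp_u64(l.offset, r.offset);
	return c ? c : cmp_u64(l.snapshot, r.snapshot);
}

/* Next position in bpos_cmp order: bump snapshot, carrying upward: */
static struct bpos bpos_successor_sketch(struct bpos p)
{
	if (++p.snapshot)
		return p;
	if (++p.offset)
		return p;
	++p.inode;
	return p;
}

int main(void)
{
	struct bpos a = { 1, 100, 3 }, b = { 1, 100, 7 };

	printf("bkey_cmp: %d, bpos_cmp: %d\n",
	       bkey_cmp_sketch(a, b), bpos_cmp_sketch(a, b)); /* 0, -1 */

	a = bpos_successor_sketch(a);
	printf("successor snapshot: %u\n", a.snapshot); /* 4 */
	return 0;
}
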
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 592dd80c..cf4ce2e7 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
int bch2_dirent_create(struct btree_trans *trans,
u64 dir_inum, const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- int flags)
+ u64 *dir_offset, int flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans,
if (ret)
return ret;
- return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
}
static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
int bch2_dirent_rename(struct btree_trans *trans,
u64 src_dir, struct bch_hash_info *src_hash,
u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum,
- const struct qstr *dst_name, u64 *dst_inum,
+ const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
struct btree_iter *src_iter = NULL, *dst_iter = NULL;
@@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter,
&new_dst->k_i, 0);
- goto out;
+ goto out_set_offset;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
@@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
out:
bch2_trans_iter_put(trans, src_iter);
bch2_trans_iter_put(trans, dst_iter);
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 34769371..e1d8ce37 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_create(struct btree_trans *, u64,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, int);
+ const struct qstr *, u64, u64 *, int);
int bch2_dirent_delete_at(struct btree_trans *,
const struct bch_hash_info *,
@@ -46,8 +46,8 @@ enum bch_rename_mode {
int bch2_dirent_rename(struct btree_trans *,
u64, struct bch_hash_info *,
u64, struct bch_hash_info *,
- const struct qstr *, u64 *,
- const struct qstr *, u64 *,
+ const struct qstr *, u64 *, u64 *,
+ const struct qstr *, u64 *, u64 *,
enum bch_rename_mode);
struct btree_iter *
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1dba7e99..f712f685 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index a7e04082..b07d3955 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
- if (bp.v->min_key.snapshot)
+ if (c->sb.version < bcachefs_metadata_version_snapshot &&
+ bp.v->min_key.snapshot)
return "invalid min_key.snapshot";
return bch2_bkey_ptrs_invalid(c, k);
@@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
btree_node_type_is_extents(btree_id) &&
bkey_cmp(bp.v->min_key, POS_MIN))
bp.v->min_key = write
- ? bkey_predecessor(bp.v->min_key)
- : bkey_successor(bp.v->min_key);
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
}
/* KEY_TYPE_extent: */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index c8069dfb..ccee43a2 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s);
/* Generic extent code: */
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_cmp(k->p, m->p) < 0;
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
+ bkey_start_pos(m)) > 0;
+
+ return (cmp1 << 1) + cmp2;
+}
+
int bch2_cut_front_s(struct bpos, struct bkey_s);
int bch2_cut_back_s(struct bpos, struct bkey_s);
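
bch2_extent_overlap(), moved into this header above, classifies how extent k overlaps extent m by packing two boolean comparisons into a two-bit value: bit 1 is "k ends before m ends", bit 0 is "k starts after m starts". A standalone sketch of the same four-way classification on plain half-open ranges (the real helper compares bpos values via bkey_cmp() and bkey_start_pos()):

#include <stdint.h>
#include <stdio.h>

enum overlap {
	OVERLAP_ALL	= 0,	/* k covers all of m        */
	OVERLAP_BACK	= 1,	/* k covers the back of m   */
	OVERLAP_FRONT	= 2,	/* k covers the front of m  */
	OVERLAP_MIDDLE	= 3,	/* k sits in the middle of m */
};

struct range { uint64_t start, end; };	/* half-open [start, end) */

static enum overlap classify(struct range k, struct range m)
{
	int ends_before  = k.end   < m.end;	/* cmp1 in the patch */
	int starts_after = k.start > m.start;	/* cmp2 in the patch */

	return (ends_before << 1) + starts_after;
}

int main(void)
{
	struct range m = { 10, 20 };

	printf("%d\n", classify((struct range){  0, 30 }, m)); /* 0: ALL    */
	printf("%d\n", classify((struct range){ 15, 30 }, m)); /* 1: BACK   */
	printf("%d\n", classify((struct range){  0, 15 }, m)); /* 2: FRONT  */
	printf("%d\n", classify((struct range){ 12, 18 }, m)); /* 3: MIDDLE */
	return 0;
}

Encoding the result this way lets callers switch on a single value instead of re-deriving the four cases from two comparisons at every call site.
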
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 503ce192..83c2168c 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
{
struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL;
+ struct btree_iter *inode_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
@@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- ret = bch2_inode_create(trans, new_inode);
+ inode_iter = bch2_inode_create(trans, new_inode);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
if (ret)
goto err;
@@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
}
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, inode_iter, new_inode);
err:
+ bch2_trans_iter_put(trans, inode_iter);
bch2_trans_iter_put(trans, dir_iter);
return ret;
}
@@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
@@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
+ inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
ret = PTR_ERR_OR_ZERO(dir_iter);
if (ret)
@@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum, BCH_HASH_SET_MUST_CREATE) ?:
- bch2_inode_write(trans, dir_iter, dir_u) ?:
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
err:
bch2_trans_iter_put(trans, dir_iter);
@@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
*inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(trans->c);
+ u64 inum, now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
@@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
@@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans,
const struct qstr *dst_name,
enum bch_rename_mode mode)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+ u64 src_inode, src_offset, dst_inode, dst_offset;
+ u64 now = bch2_current_time(c);
int ret;
src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
@@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (ret)
goto err;
- src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+ src_hash = bch2_hash_info_init(c, src_dir_u);
if (dst_dir != src_dir) {
dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
@@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (ret)
goto err;
- dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
} else {
dst_dir_u = src_dir_u;
dst_hash = src_hash;
@@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode,
- dst_name, &dst_inode,
+ src_name, &src_inode, &src_offset,
+ dst_name, &dst_inode, &dst_offset,
mode);
if (ret)
goto err;
@@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+ }
+
if (mode == BCH_RENAME_OVERWRITE) {
if (S_ISDIR(src_inode_u->bi_mode) !=
S_ISDIR(dst_inode_u->bi_mode)) {
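
With dirent creation now reporting the offset it landed at, create/link/rename can record an inode backpointer (bi_dir, bi_dir_offset) whenever the superblock version supports it. A simplified sketch of that wiring; the structs, the fake dirent insert, and the version flag are invented stand-ins for the real unpacked inode and superblock check:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the unpacked inode: */
struct inode_u {
	uint64_t bi_inum;
	uint64_t bi_dir;	 /* directory that points at this inode */
	uint64_t bi_dir_offset;	 /* dirent position inside that directory */
};

/*
 * Pretend dirent insert: reports where the dirent landed through
 * *dir_offset, mirroring the new bch2_dirent_create() out-parameter.
 */
static int dirent_create(uint64_t dir_inum, const char *name,
			 uint64_t dst_inum, uint64_t *dir_offset)
{
	(void) dir_inum;
	(void) name;
	*dir_offset = dst_inum * 16 + 1;	/* pretend hash slot */
	return 0;
}

static int create_with_backpointer(int have_backpointers,
				   uint64_t dir_inum, const char *name,
				   struct inode_u *new_inode)
{
	uint64_t dir_offset = 0;
	int ret = dirent_create(dir_inum, name, new_inode->bi_inum, &dir_offset);

	if (ret)
		return ret;

	/* Only newer superblock versions know about backpointers: */
	if (have_backpointers) {
		new_inode->bi_dir	 = dir_inum;
		new_inode->bi_dir_offset = dir_offset;
	}
	return 0;
}

int main(void)
{
	struct inode_u inode = { .bi_inum = 4096 };

	create_with_backpointer(1, 256, "file", &inode);
	printf("backpointer: %llu:%llu\n",
	       (unsigned long long) inode.bi_dir,
	       (unsigned long long) inode.bi_dir_offset);
	return 0;
}
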
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9dc162f2..62788ae1 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -675,6 +675,39 @@ retry:
continue;
}
+ if (!target.bi_nlink &&
+ !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (target.bi_dir != k.k->p.inode ||
+ target.bi_dir_offset != k.k->p.offset) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode %llu has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ d_inum,
+ target.bi_dir,
+ target.bi_dir_offset,
+ k.k->p.inode,
+ k.k->p.offset) ||
+ c->opts.version_upgrade)) {
+ struct bkey_inode_buf p;
+
+ target.bi_dir = k.k->p.inode;
+ target.bi_dir_offset = k.k->p.offset;
+ bch2_trans_unlock(&trans);
+
+ bch2_inode_pack(c, &p, &target);
+
+ ret = bch2_btree_insert(c, BTREE_ID_inodes,
+ &p.inode.k_i, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret) {
+ bch_err(c, "error in fsck: error %i updating inode", ret);
+ goto err;
+ }
+ continue;
+ }
+
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(target.bi_mode), c,
@@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ if (!S_ISDIR(u.bi_mode) &&
+ u.bi_nlink &&
+ !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+ c->opts.version_upgrade)) {
+ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+ do_update = true;
+ }
+
if (do_update) {
struct bkey_inode_buf p;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 4559e77f..f1665ca8 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(trans->c, inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
@@ -469,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-int bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u)
{
struct bch_fs *c = trans->c;
- struct bkey_inode_buf *inode_p;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 min, max, start, *hint;
@@ -493,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans,
if (start >= max || start < min)
start = min;
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
again:
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
@@ -520,7 +516,7 @@ again:
bch2_trans_iter_put(trans, iter);
if (ret)
- return ret;
+ return ERR_PTR(ret);
if (start != min) {
/* Retry from start */
@@ -528,15 +524,12 @@ again:
goto again;
}
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
found_slot:
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
-
- ret = bch2_inode_write(trans, iter, inode_u);
- bch2_trans_iter_put(trans, iter);
- return ret;
+ return iter;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 1caf036a..6bad6dfb 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+ struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, u64, bool);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 284d398b..36b10cb7 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta || new_i_size) {
bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+ inode_p.inode.k.p.snapshot = iter->snapshot;
+
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
}
@@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op)
k = bch2_keylist_front(keys);
+ k->k.p.snapshot = iter->snapshot;
+
bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter->pos, sk.k);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1f26139d..69c553a6 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
+
/*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
* is smaller:
*/
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ nr = clamp_t(unsigned, nr,
BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
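
The journal sizing change above starts from 1/128th of the device (nbuckets >> 7) and clamps the result to at most 8192 buckets or 8GiB worth of sectors, whichever is smaller (1 << 24 sectors of 512 bytes = 8GiB). A small standalone version of that calculation with a worked example; BCH_JOURNAL_BUCKETS_MIN is assumed to be 8 here, the real constant lives in the bcachefs headers.

#include <stdio.h>

#define JOURNAL_BUCKETS_MIN	8U	/* assumed value for this sketch */

static unsigned clamp_u(unsigned v, unsigned lo, unsigned hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

/* bucket_size is in 512-byte sectors, as in the member info. */
static unsigned journal_buckets(unsigned long long nbuckets,
				unsigned bucket_size)
{
	unsigned nr = nbuckets >> 7;	/* 1/128th of the device */

	return clamp_u(nr, JOURNAL_BUCKETS_MIN,
		       min_u(1U << 13,			 /* 8192 buckets    */
			     (1U << 24) / bucket_size)); /* 8GiB in sectors */
}

int main(void)
{
	/* 1TiB device with 512KiB buckets: 2097152 buckets of 1024 sectors */
	unsigned nr = journal_buckets(2097152, 1024);

	/* 2097152 >> 7 = 16384, capped at min(8192, 16384) = 8192 */
	printf("journal buckets: %u\n", nr);
	return 0;
}
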
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 54f2e205..c7fa03cf 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl)
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
if (validate_before_checksum &&
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index bbf8e5ad..4a5b50ed 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
- c->btree_key_cache.nr_dirty,
- c->btree_key_cache.nr_keys);
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index f863fd74..3d1bf87e 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id l_btree_id,
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
- bkey_cmp(l_pos, r->k->k.p));
+ bpos_cmp(l_pos, r->k->k.p));
}
static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
{
return (cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p));
+ bpos_cmp(l->k->k.p, r->k->k.p));
}
static size_t journal_key_search(struct journal_keys *journal_keys,
@@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
if (iter->idx > idx ||
(iter->idx == idx &&
biter->last &&
- bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+ bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
iter->idx++;
}
@@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
if (btree_k.k && journal_k.k) {
- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+ int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
if (!cmp)
bch2_journal_iter_advance_btree(iter);
@@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
ret = iter->last == journal ? journal_k : btree_k;
if (iter->b &&
- bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+ bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
iter->journal.idx = iter->journal.keys->nr;
iter->last = none;
return bkey_s_c_null;
@@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
return cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
- !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
*dst++ = *src++;
@@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ bpos_cmp(l->k->k.p, r->k->k.p);
}
static int bch2_journal_replay(struct bch_fs *c,
@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+ ret = -EINVAL;
+ goto err;
+
+ }
+
if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
bch_info(c, "alloc_v2 feature bit not set, fsck required");
c->opts.fsck = true;
@@ -1338,6 +1345,7 @@ int bch2_fs_initialize(struct bch_fs *c)
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(c, &packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_inodes,
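
The journal-key comparators above chain cmp_int() and bpos_cmp() with the GNU a ?: b extension: each comparison only falls through to the next when the previous one returned 0, giving a lexicographic order over (btree_id, level, position, ...). A minimal sketch of the same idiom; cmp_int here is a local macro approximating the one in the bcachefs utility headers, and the key layout is invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Three-way compare that never overflows: */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct journal_key_sketch {
	uint32_t btree_id;
	uint32_t level;
	uint64_t pos;		/* stands in for a full struct bpos */
	uint64_t journal_seq;
};

static int journal_key_cmp_sketch(const void *_l, const void *_r)
{
	const struct journal_key_sketch *l = _l, *r = _r;

	/* GNU a ?: b evaluates to a unless a == 0, so later comparisons
	 * only break ties left by earlier ones: */
	return  cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->level, r->level) ?:
		cmp_int(l->pos, r->pos) ?:
		cmp_int(l->journal_seq, r->journal_seq);
}

int main(void)
{
	struct journal_key_sketch keys[] = {
		{ 1, 0, 100, 9 },
		{ 0, 0, 500, 2 },
		{ 0, 0, 100, 7 },
	};

	qsort(keys, 3, sizeof(keys[0]), journal_key_cmp_sketch);

	for (unsigned i = 0; i < 3; i++)
		printf("btree %u pos %llu seq %llu\n", keys[i].btree_id,
		       (unsigned long long) keys[i].pos,
		       (unsigned long long) keys[i].journal_seq);
	return 0;
}
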
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index dfb12fdd..7507b6bc 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
goto err;
}
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
goto err;
}
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -113,7 +115,7 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
BUG_ON(i);
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -166,7 +169,7 @@ err:
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
BUG_ON(i);
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
BUG_ON(k.k->p.offset != i);
i += 2;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr * 2);
@@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
if (i == nr * 2)
break;
}
+ bch2_trans_iter_put(&trans, iter);
err:
bch2_trans_exit(&trans);
return ret;
@@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
BUG_ON(k.k->size != 8);
i += 16;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr);
@@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
if (i == nr)
break;
}
+ bch2_trans_iter_put(&trans, iter);
err:
bch2_trans_exit(&trans);
return 0;
@@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return 0;
}
@@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return 0;
}
@@ -473,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr)
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
@@ -508,7 +519,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
}
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -549,7 +560,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
}
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -630,6 +641,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
if (++i == nr)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
@@ -645,6 +658,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
;
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
@@ -671,6 +686,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
break;
}
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
diff --git a/linux/rhashtable.c b/linux/rhashtable.c
index 351eac79..ba2196fc 100644
--- a/linux/rhashtable.c
+++ b/linux/rhashtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Resizable, Scalable, Concurrent Hash Table
*
@@ -8,27 +9,29 @@
* Code partially derived from nft_hash
* Rewritten with rehash code from br_multicast plus single list
* pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/atomic.h>
-#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
+#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/jhash.h>
+#include <linux/overflow.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
+#include <linux/export.h>
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4U
-#define BUCKET_LOCKS_PER_CPU 32UL
+
+union nested_table {
+ union nested_table __rcu *table;
+ struct rhash_lock_head __rcu *bucket;
+};
static u32 head_hashfn(struct rhashtable *ht,
const struct bucket_table *tbl,
@@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht,
return rht_head_hashfn(ht, tbl, he, ht->p);
}
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
- gfp_t gfp)
-{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
- nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
- size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
- /* Never allocate more than 0.5 locks per bucket */
- size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+ if (!debug_locks)
+ return 1;
+ if (unlikely(tbl->nest))
+ return 1;
+ return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
- if (sizeof(spinlock_t) != 0) {
- tbl->locks = NULL;
- if (gfp != GFP_KERNEL)
- gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+ const struct bucket_table *tbl)
+{
+ /* The top-level bucket entry does not need RCU protection
+ * because it's set at the same time as tbl->nest.
+ */
+ return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
- if (!tbl->locks)
- tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
- gfp);
- if (!tbl->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ const unsigned int len = 1 << shift;
+ unsigned int i;
+
+ ntbl = rcu_dereference_protected(ntbl->table, 1);
+ if (!ntbl)
+ return;
+
+ if (size > len) {
+ size >>= shift;
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
}
- tbl->locks_mask = size - 1;
- return 0;
+ kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int len = 1 << tbl->nest;
+ union nested_table *ntbl;
+ unsigned int i;
+
+ ntbl = nested_table_top(tbl);
+
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
+
+ kfree(ntbl);
}
static void bucket_table_free(struct bucket_table *tbl)
{
- if (tbl)
- kvfree(tbl->locks);
+ if (tbl->nest)
+ nested_bucket_table_free(tbl);
kvfree(tbl);
}
@@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
bucket_table_free(container_of(head, struct bucket_table, rcu));
}
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+ union nested_table __rcu **prev,
+ bool leaf)
+{
+ union nested_table *ntbl;
+ int i;
+
+ ntbl = rcu_dereference(*prev);
+ if (ntbl)
+ return ntbl;
+
+ ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+ if (ntbl && leaf) {
+ for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+ INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+ }
+
+ if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+ return ntbl;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets,
+ gfp_t gfp)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ struct bucket_table *tbl;
+ size_t size;
+
+ if (nbuckets < (1 << (shift + 1)))
+ return NULL;
+
+ size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+ tbl = kzalloc(size, gfp);
+ if (!tbl)
+ return NULL;
+
+ if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+ false)) {
+ kfree(tbl);
+ return NULL;
+ }
+
+ tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+ return tbl;
+}
+
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t nbuckets,
gfp_t gfp)
@@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t size;
int i;
- size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
- gfp != GFP_KERNEL)
- tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- if (tbl == NULL && gfp == GFP_KERNEL)
- tbl = vzalloc(size);
- if (tbl == NULL)
- return NULL;
+ tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
- tbl->size = nbuckets;
+ size = nbuckets;
- if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
- bucket_table_free(tbl);
- return NULL;
+ if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+ tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+ nbuckets = 0;
}
+ if (tbl == NULL)
+ return NULL;
+
+ tbl->size = size;
+
+ rcu_head_init(&tbl->rcu);
INIT_LIST_HEAD(&tbl->walkers);
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ tbl->hash_rnd = get_random_u32();
for (i = 0; i < nbuckets; i++)
- INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+ INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
return tbl;
}
@@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
return new_tbl;
}
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- struct bucket_table *new_tbl = rhashtable_last_table(ht,
- rht_dereference_rcu(old_tbl->future_tbl, ht));
- struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
- int err = -ENOENT;
+ struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+ int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
- spinlock_t *new_bucket_lock;
+ struct rhash_head __rcu **pprev = NULL;
unsigned int new_hash;
- rht_for_each(entry, old_tbl, old_hash) {
+ if (new_tbl->nest)
+ goto out;
+
+ err = -ENOENT;
+
+ rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+ old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
new_hash = head_hashfn(ht, new_tbl, entry);
- new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+ rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
- spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
- head = rht_dereference_bucket(new_tbl->buckets[new_hash],
- new_tbl, new_hash);
+ head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
RCU_INIT_POINTER(entry->next, head);
- rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
- spin_unlock(new_bucket_lock);
+ rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
- rcu_assign_pointer(*pprev, next);
+ if (pprev)
+ rcu_assign_pointer(*pprev, next);
+ else
+ /* Need to preserved the bit lock. */
+ rht_assign_locked(bkt, next);
out:
return err;
}
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- spinlock_t *old_bucket_lock;
+ struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+ int err;
- old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+ if (!bkt)
+ return 0;
+ rht_lock(old_tbl, bkt);
- spin_lock_bh(old_bucket_lock);
- while (!rhashtable_rehash_one(ht, old_hash))
+ while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
;
- old_tbl->rehash++;
- spin_unlock_bh(old_bucket_lock);
+
+ if (err == -ENOENT)
+ err = 0;
+ rht_unlock(old_tbl, bkt);
+
+ return err;
}
static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
- /* Protect future_tbl using the first bucket lock. */
- spin_lock_bh(old_tbl->locks);
-
- /* Did somebody beat us to it? */
- if (rcu_access_pointer(old_tbl->future_tbl)) {
- spin_unlock_bh(old_tbl->locks);
- return -EEXIST;
- }
-
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
+ * As cmpxchg() provides strong barriers, we do not need
+ * rcu_assign_pointer().
*/
- rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
- spin_unlock_bh(old_tbl->locks);
+ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+ new_tbl) != NULL)
+ return -EEXIST;
return 0;
}
@@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
struct bucket_table *new_tbl;
struct rhashtable_walker *walker;
unsigned int old_hash;
+ int err;
new_tbl = rht_dereference(old_tbl->future_tbl, ht);
if (!new_tbl)
return 0;
- for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
- rhashtable_rehash_chain(ht, old_hash);
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+ err = rhashtable_rehash_chain(ht, old_hash);
+ if (err)
+ return err;
+ cond_resched();
+ }
/* Publish the new table pointer. */
rcu_assign_pointer(ht->tbl, new_tbl);
@@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
- spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
+ * We do this inside the locked region so that
+ * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+ * to detect when it must not re-link the table.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+ spin_unlock(&ht->lock);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+ struct bucket_table *old_tbl,
+ unsigned int size)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl;
int err;
- old_tbl = rhashtable_last_table(ht, old_tbl);
+ ASSERT_RHT_MUTEX(ht);
- new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
@@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht)
return err;
}
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht: the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., to the smallest
+ * size that would not cause it to expand again right away.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
static int rhashtable_shrink(struct rhashtable *ht)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
unsigned int nelems = atomic_read(&ht->nelems);
unsigned int size = 0;
- int err;
if (nelems)
size = roundup_pow_of_two(nelems * 3 / 2);
@@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
if (rht_dereference(old_tbl->future_tbl, ht))
return -EEXIST;
- new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (new_tbl == NULL)
- return -ENOMEM;
-
- err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
- if (err)
- bucket_table_free(new_tbl);
-
- return err;
+ return rhashtable_rehash_alloc(ht, old_tbl, size);
}
static void rht_deferred_worker(struct work_struct *work)
@@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work)
tbl = rhashtable_last_table(ht, tbl);
if (rht_grow_above_75(ht, tbl))
- rhashtable_expand(ht);
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
- rhashtable_shrink(ht);
+ err = rhashtable_shrink(ht);
+ else if (tbl->nest)
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+ if (!err || err == -EEXIST) {
+ int nerr;
- err = rhashtable_rehash_table(ht);
+ nerr = rhashtable_rehash_table(ht);
+ err = err ?: nerr;
+ }
mutex_unlock(&ht->mutex);
@@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work)
schedule_work(&ht->run_work);
}
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
- struct bucket_table *tbl,
- unsigned int hash)
-{
- unsigned int elasticity = ht->elasticity;
- struct rhash_head *head;
-
- rht_for_each(head, tbl, hash)
- if (!--elasticity)
- return true;
-
- return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
- struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
@@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
err = -ENOMEM;
- new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
if (new_tbl == NULL)
goto fail;
@@ -363,7 +470,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
fail:
/* Do not fail the insert if someone else did a rehash. */
- if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ if (likely(rcu_access_pointer(tbl->future_tbl)))
return 0;
/* Schedule async rehash to retry allocation in process context. */
@@ -373,57 +480,485 @@ fail:
return err;
}
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
- const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash,
+ const void *key, struct rhash_head *obj)
{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct rhash_head __rcu **pprev = NULL;
struct rhash_head *head;
- unsigned int hash;
- int err;
+ int elasticity;
+
+ elasticity = RHT_ELASTICITY;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+ struct rhlist_head *plist;
+
+ elasticity--;
+ if (!key ||
+ (ht->p.obj_cmpfn ?
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- tbl = rhashtable_last_table(ht, tbl);
- hash = head_hashfn(ht, tbl, obj);
- spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+ if (!ht->rhlist)
+ return rht_obj(ht, head);
- err = -EEXIST;
- if (key && rhashtable_lookup_fast(ht, key, ht->p))
- goto exit;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto exit;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev)
+ rcu_assign_pointer(*pprev, obj);
+ else
+ /* Need to preserve the bit lock */
+ rht_assign_locked(bkt, obj);
+
+ return NULL;
+ }
+
+ if (elasticity <= 0)
+ return ERR_PTR(-EAGAIN);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+ struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+ void *data)
+{
+ struct bucket_table *new_tbl;
+ struct rhash_head *head;
+
+ if (!IS_ERR_OR_NULL(data))
+ return ERR_PTR(-EEXIST);
+
+ if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
+
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (new_tbl)
+ return new_tbl;
+
+ if (PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
- err = -EAGAIN;
- if (rhashtable_check_elasticity(ht, tbl, hash) ||
- rht_grow_above_100(ht, tbl))
- goto exit;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ return ERR_PTR(-E2BIG);
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ return ERR_PTR(-EAGAIN);
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (ht->rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
+
+ /* bkt is always the head of the list, so it holds
+ * the lock, which we need to preserve
+ */
+ rht_assign_locked(bkt, obj);
atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
+
+ return NULL;
+}
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ struct bucket_table *new_tbl;
+ struct bucket_table *tbl;
+ struct rhash_lock_head __rcu **bkt;
+ unsigned int hash;
+ void *data;
+
+ new_tbl = rcu_dereference(ht->tbl);
+
+ do {
+ tbl = new_tbl;
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ if (rcu_access_pointer(tbl->future_tbl))
+ /* Failure is OK */
+ bkt = rht_bucket_var(tbl, hash);
+ else
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ if (bkt == NULL) {
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ data = ERR_PTR(-EAGAIN);
+ } else {
+ rht_lock(tbl, bkt);
+ data = rhashtable_lookup_one(ht, bkt, tbl,
+ hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+ hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ rht_unlock(tbl, bkt);
+ }
+ } while (!IS_ERR_OR_NULL(new_tbl));
+
+ if (PTR_ERR(data) == -EAGAIN)
+ data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+ -EAGAIN);
+
+ return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ void *data;
+
+ do {
+ rcu_read_lock();
+ data = rhashtable_try_insert(ht, key, obj);
+ rcu_read_unlock();
+ } while (PTR_ERR(data) == -EAGAIN);
-exit:
- spin_unlock(rht_bucket_lock(tbl, hash));
+ return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
- if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+ iter->ht = ht;
+ iter->p = NULL;
+ iter->slot = 0;
+ iter->skip = 0;
+ iter->end_of_table = 0;
+
+ spin_lock(&ht->lock);
+ iter->walker.tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+ list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+ spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter: Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+ spin_lock(&iter->ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position. Note that we take
+ * the RCU lock in all cases including when we return an error. So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+ __acquires(RCU)
+{
+ struct rhashtable *ht = iter->ht;
+ bool rhlist = ht->rhlist;
+
+ rcu_read_lock();
+
+ spin_lock(&ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&ht->lock);
+
+ if (iter->end_of_table)
+ return 0;
+ if (!iter->walker.tbl) {
+ iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+ iter->slot = 0;
+ iter->skip = 0;
+ return -EAGAIN;
+ }
+
+ if (iter->p && !rhlist) {
+ /*
+ * We need to validate that 'p' is still in the table, and
+ * if so, update 'skip'
+ */
+ struct rhash_head *p;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ skip++;
+ if (p == iter->p) {
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ iter->p = NULL;
+ } else if (iter->p && rhlist) {
+ /* Need to validate that 'list' is still in the table, and
+ * if so, update 'skip' and 'p'.
+ */
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ for (list = container_of(p, struct rhlist_head, rhead);
+ list;
+ list = rcu_dereference(list->next)) {
+ skip++;
+ if (list == iter->list) {
+ iter->p = p;
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ }
+ iter->p = NULL;
+ }
+found:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter: Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+ struct bucket_table *tbl = iter->walker.tbl;
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (!tbl)
return NULL;
- else if (err == -EAGAIN)
- return tbl;
+
+ for (; iter->slot < tbl->size; iter->slot++) {
+ int skip = iter->skip;
+
+ rht_for_each_rcu(p, tbl, iter->slot) {
+ if (rhlist) {
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ do {
+ if (!skip)
+ goto next;
+ skip--;
+ list = rcu_dereference(list->next);
+ } while (list);
+
+ continue;
+ }
+ if (!skip)
+ break;
+ skip--;
+ }
+
+next:
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ iter->skip = 0;
+ }
+
+ iter->p = NULL;
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker.tbl) {
+ iter->slot = 0;
+ iter->skip = 0;
+ return ERR_PTR(-EAGAIN);
+ } else {
+ iter->end_of_table = true;
+ }
+
+ return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (p) {
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head, rhead);
+ }
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ /* At the end of this slot, switch to next one and then find
+ * next entry from that point.
+ */
+ iter->skip = 0;
+ iter->slot++;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (p)
+ return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+ /* No object found in current iter, find next one in the table. */
+
+ if (iter->skip) {
+ /* A nonzero skip value points to the next entry in the table
+ * beyond the last one that was found. Decrement skip so
+ * we find the current value. __rhashtable_walk_find_next
+ * will restore the original value of skip assuming that
+ * the table hasn't changed.
+ */
+ iter->skip--;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Finish a hash table walk. Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+ __releases(RCU)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl = iter->walker.tbl;
+
+ if (!tbl)
+ goto out;
+
+ ht = iter->ht;
+
+ spin_lock(&ht->lock);
+ if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+ /* This bucket table is being freed, don't re-link it. */
+ iter->walker.tbl = NULL;
else
- return ERR_PTR(err);
+ list_add(&iter->walker.list, &tbl->walkers);
+ spin_unlock(&ht->lock);
+
+out:
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
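Editor's note: as a usage sketch only (not part of this patch), the walker API above is typically combined as below. Here ht is assumed to be an already-initialized struct rhashtable holding struct test_obj entries (the hypothetical structure from the rhashtable_init() kernel-doc further down); an -EAGAIN from rhashtable_walk_next() just means a resize rewound the iterator, so the loop may continue.

	struct rhashtable_iter iter;
	struct test_obj *obj;

	rhashtable_walk_enter(&ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)
				continue;	/* table resized, iterator rewound */
			break;
		}
		/* ... use obj; duplicates may be seen across stop/start ... */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);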
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
- return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
- (unsigned long)params->min_size);
+ size_t retsize;
+
+ if (params->nelem_hint)
+ retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+ (unsigned long)params->min_size);
+ else
+ retsize = max(HASH_DEFAULT_SIZE,
+ (unsigned long)params->min_size);
+
+ return retsize;
}
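As a worked example (illustrative numbers, not from the patch): with nelem_hint = 100 and min_size = 16 the initial table gets max(roundup_pow_of_two(100 * 4 / 3), 16) = roundup_pow_of_two(133) = 256 buckets, while with no hint it simply gets max(HASH_DEFAULT_SIZE, min_size).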
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
@@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
return jhash2(key, length, seed);
}
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht: hash table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ * int key;
+ * void * my_member;
+ * struct rhash_head node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .key_offset = offsetof(struct test_obj, key),
+ * .key_len = sizeof(int),
+ * .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ * [...]
+ * struct rhash_head node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ * struct test_obj *obj = data;
+ *
+ * return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .hashfn = jhash,
+ * .obj_hashfn = my_hash_fn,
+ * };
+ */
int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params)
{
struct bucket_table *tbl;
size_t size;
- size = HASH_DEFAULT_SIZE;
-
if ((!params->key_len && !params->obj_hashfn) ||
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
- if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
- return -EINVAL;
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
@@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht,
if (params->min_size)
ht->p.min_size = roundup_pow_of_two(params->min_size);
- if (params->max_size)
- ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ /* Cap total entries at 2^31 to avoid nelems overflow. */
+ ht->max_elems = 1u << 31;
- if (params->insecure_max_entries)
- ht->p.insecure_max_entries =
- rounddown_pow_of_two(params->insecure_max_entries);
- else
- ht->p.insecure_max_entries = ht->p.max_size * 2;
-
- ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+ if (params->max_size) {
+ ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ if (ht->p.max_size < ht->max_elems / 2)
+ ht->max_elems = ht->p.max_size * 2;
+ }
- if (params->nelem_hint)
- size = rounded_hashtable_size(&ht->p);
-
- /* The maximum (not average) chain length grows with the
- * size of the hash table, at a rate of (log N)/(log log N).
- * The value of 16 is selected so that even if the hash
- * table grew to 2^32 you would not expect the maximum
- * chain length to exceed it unless we are under attack
- * (or extremely unlucky).
- *
- * As this limit is only to detect attacks, we don't need
- * to set it to a lower value as you'd need the chain
- * length to vastly exceed 16 to have any real effect
- * on the system.
- */
- if (!params->insecure_elasticity)
- ht->elasticity = 16;
+ ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
- if (params->locks_mul)
- ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
- else
- ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+ size = rounded_hashtable_size(&ht->p);
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
@@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht,
}
}
+ /*
+ * This is API initialization and thus we need to guarantee the
+ * initial rhashtable allocation. Upon failure, retry at the
+ * smallest possible size using __GFP_NOFAIL semantics.
+ */
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (tbl == NULL)
- return -ENOMEM;
+ if (unlikely(tbl == NULL)) {
+ size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+ }
atomic_set(&ht->nelems, 0);
@@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht,
return 0;
}
+EXPORT_SYMBOL_GPL(rhashtable_init);
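Editor's note: to tie the kernel-doc example above to the fast-path helpers declared in include/linux/rhashtable.h, here is a hedged end-to-end sketch (not part of this patch; test_obj is the struct from Configuration Example 1, and test_obj_params / test_obj_example are made-up names).

	static const struct rhashtable_params test_obj_params = {
		.head_offset	= offsetof(struct test_obj, node),
		.key_offset	= offsetof(struct test_obj, key),
		.key_len	= sizeof(int),
		.automatic_shrinking = true,
	};

	static int test_obj_example(void)
	{
		struct rhashtable ht;
		struct test_obj *obj;
		int key = 1;
		int err;

		err = rhashtable_init(&ht, &test_obj_params);
		if (err)
			return err;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			err = -ENOMEM;
			goto out;
		}
		obj->key = key;

		/* Fast path; falls back to rhashtable_insert_slow() internally. */
		err = rhashtable_insert_fast(&ht, &obj->node, test_obj_params);
		if (err) {
			kfree(obj);
			goto out;
		}

		/* Lookups are lock-free under rcu_read_lock(). */
		rcu_read_lock();
		obj = rhashtable_lookup_fast(&ht, &key, test_obj_params);
		rcu_read_unlock();

		if (obj && !rhashtable_remove_fast(&ht, &obj->node, test_obj_params))
			kfree(obj);	/* single-threaded here; use kfree_rcu() with concurrent readers */
	out:
		rhashtable_destroy(&ht);
		return err;
	}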
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt: hash list table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
- struct bucket_table *tbl;
+ int err;
+
+ err = rhashtable_init(&hlt->ht, params);
+ hlt->ht.rhlist = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
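Editor's note: a minimal rhlist sketch, assuming the rhltable_insert(), rhltable_remove() and rhltable_destroy() helpers from include/linux/rhashtable.h (dup_obj, dup_params and dup_obj_example are made-up names, not from this patch). The rhlist variant allows several objects with the same key to hang off one bucket entry.

	struct dup_obj {
		int			key;
		struct rhlist_head	list;	/* rhlist_head, not rhash_head */
	};

	static const struct rhashtable_params dup_params = {
		.head_offset	= offsetof(struct dup_obj, list),
		.key_offset	= offsetof(struct dup_obj, key),
		.key_len	= sizeof(int),
	};

	static int dup_obj_example(void)
	{
		struct rhltable hlt;
		struct dup_obj *obj;
		int err;

		err = rhltable_init(&hlt, &dup_params);
		if (err)
			return err;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			err = -ENOMEM;
			goto out;
		}
		obj->key = 1;

		/* A second insert with key == 1 would be chained, not rejected with -EEXIST. */
		err = rhltable_insert(&hlt, &obj->list, dup_params);
		if (err) {
			kfree(obj);
			goto out;
		}

		if (!rhltable_remove(&hlt, &obj->list, dup_params))
			kfree(obj);	/* single-threaded here; use kfree_rcu() with concurrent readers */
	out:
		rhltable_destroy(&hlt);
		return err;
	}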
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct rhlist_head *list;
+
+ if (!ht->rhlist) {
+ free_fn(rht_obj(ht, obj), arg);
+ return;
+ }
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ do {
+ obj = &list->rhead;
+ list = rht_dereference(list->next, ht);
+ free_fn(rht_obj(ht, obj), arg);
+ } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops any pending async resize. If defined, invokes free_fn for each
+ * element to release its resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller must ensure that no further write operations
+ * occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct bucket_table *tbl, *next_tbl;
+ unsigned int i;
cancel_work_sync(&ht->run_work);
mutex_lock(&ht->mutex);
tbl = rht_dereference(ht->tbl, ht);
+restart:
+ if (free_fn) {
+ for (i = 0; i < tbl->size; i++) {
+ struct rhash_head *pos, *next;
+
+ cond_resched();
+ for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL;
+ !rht_is_a_nulls(pos);
+ pos = next,
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL)
+ rhashtable_free_one(ht, pos, free_fn, arg);
+ }
+ }
+
+ next_tbl = rht_dereference(tbl->future_tbl, ht);
bucket_table_free(tbl);
+ if (next_tbl) {
+ tbl = next_tbl;
+ goto restart;
+ }
mutex_unlock(&ht->mutex);
}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
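Editor's note: for example (a sketch, not part of this patch), a caller whose elements were kmalloc'd can hand kfree in via the callback; free_test_obj and ht are assumed names.

	/* Assumed caller-side helper; ptr is the element, arg is unused here. */
	static void free_test_obj(void *ptr, void *arg)
	{
		kfree(ptr);
	}

	/* On teardown, once all writers have stopped: */
	rhashtable_free_and_destroy(&ht, free_test_obj, NULL);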
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+ return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int subhash = hash;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+ subhash >>= tbl->nest;
+
+ while (ntbl && size > (1 << shift)) {
+ index = subhash & ((1 << shift) - 1);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+ tbl, hash);
+ size >>= shift;
+ subhash >>= shift;
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ static struct rhash_lock_head __rcu *rhnull;
+
+ if (!rhnull)
+ INIT_RHT_NULLS_HEAD(rhnull);
+ return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ hash >>= tbl->nest;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+
+ while (ntbl && size > (1 << shift)) {
+ index = hash & ((1 << shift) - 1);
+ size >>= shift;
+ hash >>= shift;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/linux/six.c b/linux/six.c
index 53280044..fe721891 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/six.h>
+#include <linux/slab.h>
#ifdef DEBUG
#define EBUG_ON(cond) BUG_ON(cond)
@@ -309,6 +310,9 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
wake_up_process(p);
}
+ if (ret)
+ six_acquire(&lock->dep_map, 1);
+
return ret;
}
@@ -560,6 +564,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
lock->readers) {
smp_mb(); /* unlock barrier */
this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
state.v = READ_ONCE(lock->state.v);
} else {
EBUG_ON(!(lock->state.v & l[type].held_mask));
@@ -705,6 +710,34 @@ void six_lock_wakeup_all(struct six_lock *lock)
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+struct free_pcpu_rcu {
+ struct rcu_head rcu;
+ void __percpu *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+ struct free_pcpu_rcu *rcu =
+ container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+ free_percpu(rcu->p);
+ kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+ if (!rcu)
+ return;
+
+ rcu->p = lock->readers;
+ lock->readers = NULL;
+
+ call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
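Editor's note: a brief usage sketch (not part of this patch), assuming six_lock_init() from include/linux/six.h together with the six_lock_pcpu_alloc() shown just below.

	struct six_lock lock;

	six_lock_init(&lock);
	six_lock_pcpu_alloc(&lock);	/* switch readers to per-cpu counters */
	/* ... lock is used via the six_lock_read()/six_unlock_read() family ... */
	six_lock_pcpu_free_rcu(&lock);	/* revert; old counters freed after an RCU grace period */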
+
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -717,8 +750,6 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
void six_lock_pcpu_alloc(struct six_lock *lock)
{
- BUG_ON(lock->readers && pcpu_read_count(lock));
- BUG_ON(lock->state.read_lock);
#ifdef __KERNEL__
if (!lock->readers)
lock->readers = alloc_percpu(unsigned);