author    Kent Overstreet <kent.overstreet@gmail.com>  2021-03-28 17:38:28 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2021-03-29 00:22:38 -0400
commit    a2094890a90a2f865e49f94e8448deca7e5852ef
tree      11bf5f426509e288b2b3482492c805a26bb1885a
parent    bb6eccc2ecd4728871bfc70462d3a4a20daa9d68

Update bcachefs sources to 18686af684 bcachefs: Inode backpointers (v0.13)
-rw-r--r--  .bcachefs_revision                    |    2
-rw-r--r--  Makefile                              |    4
-rw-r--r--  cmd_debug.c                           |    4
-rw-r--r--  include/linux/list_nulls.h            |  145
-rw-r--r--  include/linux/overflow.h              |  346
-rw-r--r--  include/linux/poison.h                |   85
-rw-r--r--  include/linux/random.h                |    1
-rw-r--r--  include/linux/rcupdate.h              |   28
-rw-r--r--  include/linux/rhashtable-types.h      |  135
-rw-r--r--  include/linux/rhashtable.h            | 1205
-rw-r--r--  include/linux/six.h                   |    1
-rw-r--r--  include/linux/slab.h                  |    1
-rw-r--r--  include/linux/types.h                 |    2
-rw-r--r--  libbcachefs/bcachefs_format.h         |   31
-rw-r--r--  libbcachefs/bkey.c                    |   25
-rw-r--r--  libbcachefs/bkey.h                    |   78
-rw-r--r--  libbcachefs/bkey_methods.c            |   46
-rw-r--r--  libbcachefs/bkey_sort.c               |    2
-rw-r--r--  libbcachefs/bset.c                    |   84
-rw-r--r--  libbcachefs/bset.h                    |   22
-rw-r--r--  libbcachefs/btree_cache.c             |   12
-rw-r--r--  libbcachefs/btree_gc.c                |   28
-rw-r--r--  libbcachefs/btree_gc.h                |   10
-rw-r--r--  libbcachefs/btree_io.c                |   32
-rw-r--r--  libbcachefs/btree_io.h                |   30
-rw-r--r--  libbcachefs/btree_iter.c              |  103
-rw-r--r--  libbcachefs/btree_iter.h              |    4
-rw-r--r--  libbcachefs/btree_key_cache.c         |  185
-rw-r--r--  libbcachefs/btree_key_cache.h         |    8
-rw-r--r--  libbcachefs/btree_types.h             |   32
-rw-r--r--  libbcachefs/btree_update.h            |    2
-rw-r--r--  libbcachefs/btree_update_interior.c   |   79
-rw-r--r--  libbcachefs/btree_update_leaf.c       |   45
-rw-r--r--  libbcachefs/debug.c                   |   10
-rw-r--r--  libbcachefs/dirent.c                  |   18
-rw-r--r--  libbcachefs/dirent.h                  |    6
-rw-r--r--  libbcachefs/ec.c                      |    1
-rw-r--r--  libbcachefs/extents.c                 |    7
-rw-r--r--  libbcachefs/extents.h                 |   18
-rw-r--r--  libbcachefs/fs-common.c               |   68
-rw-r--r--  libbcachefs/fsck.c                    |   43
-rw-r--r--  libbcachefs/inode.c                   |   19
-rw-r--r--  libbcachefs/inode.h                   |    3
-rw-r--r--  libbcachefs/io.c                      |    5
-rw-r--r--  libbcachefs/journal.c                 |   11
-rw-r--r--  libbcachefs/journal_io.c              |    2
-rw-r--r--  libbcachefs/journal_reclaim.c         |    4
-rw-r--r--  libbcachefs/recovery.c                |   24
-rw-r--r--  libbcachefs/tests.c                   |   29
-rw-r--r--  linux/rhashtable.c                    | 1081
-rw-r--r--  linux/six.c                           |   35
51 files changed, 3435 insertions(+), 766 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 976139a3..385c19f6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ad68801b939cdda0530f54cd07b3212e98fe1d75
+18686af68412ebfad9c2adc6ee976ffdb9e1b886
diff --git a/Makefile b/Makefile
index 6999b93a..3fe96048 100644
--- a/Makefile
+++ b/Makefile
@@ -156,6 +156,10 @@ update-bcachefs-sources:
git add linux/six.c
cp $(LINUX_DIR)/include/linux/six.h include/linux/
git add include/linux/six.h
+ cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+ git add include/linux/list_nulls.h
+ cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+ git add include/linux/poison.h
$(RM) libbcachefs/*.mod.c
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
git add .bcachefs_revision
diff --git a/cmd_debug.c b/cmd_debug.c
index 3baa6978..4938ec07 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -323,9 +323,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
le64_to_cpu(i->journal_seq));
offset += sectors;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i))) {
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
struct bkey u;
char buf[4096];
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644
index 00000000..fa6e8471
--- /dev/null
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+ struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+ struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+ })
+/**
+ * is_a_nulls - Test if a ptr is a 'nulls' end-of-list marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr) >> 1;
+}
+
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+ return !h->pprev;
+}
+
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+ return !READ_ONCE(h->pprev);
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+ return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+ struct hlist_nulls_head *h)
+{
+ struct hlist_nulls_node *first = h->first;
+
+ n->next = first;
+ WRITE_ONCE(n->pprev, &h->first);
+ h->first = n;
+ if (!is_a_nulls(first))
+ WRITE_ONCE(first->pprev, &n->next);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ struct hlist_nulls_node *next = n->next;
+ struct hlist_nulls_node **pprev = n->pprev;
+
+ WRITE_ONCE(*pprev, next);
+ if (!is_a_nulls(next))
+ WRITE_ONCE(next->pprev, pprev);
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ __hlist_nulls_del(n);
+ WRITE_ONCE(n->pprev, LIST_POISON2);
+}
+
+/**
+ * hlist_nulls_for_each_entry - iterate over list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member) \
+ for (pos = (head)->first; \
+ (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+ * @member: the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member) \
+ for (; (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+#endif
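
As a rough illustration of the 'nulls' encoding described at the top of this header, here is a hypothetical user-space sketch; the marker value 7 and all variable names are chosen only for the example:

#include <linux/list_nulls.h>

static struct hlist_nulls_head demo_bucket;
static struct hlist_nulls_node demo_node;

static void nulls_demo(void)
{
	struct hlist_nulls_node *end;

	INIT_HLIST_NULLS_HEAD(&demo_bucket, 7);	/* first = (7 << 1) | 1 */
	hlist_nulls_add_head(&demo_node, &demo_bucket);

	/* Walking off the end lands on the marker, not on NULL: */
	end = demo_node.next;
	if (is_a_nulls(end))
		(void) get_nulls_value(end);	/* == 7: identifies the chain */
}
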
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644
index 00000000..ef74051d
--- /dev/null
+++ b/include/linux/overflow.h
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type) (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
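
For concreteness, a few values these macros evaluate to, assuming the usual u8/s8/u32 typedefs (an illustrative sketch, not part of the header):

_Static_assert(type_max(u8)  == 255,  "u8 max");
_Static_assert(type_min(u8)  == 0,    "u8 min");
_Static_assert(type_max(s8)  == 127,  "s8 max");
_Static_assert(type_min(s8)  == -128, "s8 min");
_Static_assert(type_max(u32) == 4294967295u, "u32 max");
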
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+ return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_add_overflow(__a, __b, __d); \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_sub_overflow(__a, __b, __d); \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_mul_overflow(__a, __b, __d); \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a + __b; \
+ *__d < __a; \
+})
+#define __unsigned_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a - __b; \
+ __a < __b; \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a * __b; \
+ __builtin_constant_p(__b) ? \
+ __b > 0 && __a > type_max(typeof(__a)) / __b : \
+ __a > 0 && __b > type_max(typeof(__b)) / __a; \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a + (u64)__b; \
+ (((~(__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
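
A worked instance of the sign rule, assuming an s8 destination (hypothetical, for illustration only):

static inline int signed_add_demo(void)
{
	s8 a = 100, b = 100, res;

	/* 100 + 100 wraps to -56: the operands share a sign, the result's
	 * sign differs, so (~(a ^ b)) & (res ^ a) has the sign bit set and
	 * the macro reports overflow (returns nonzero). */
	return __signed_add_overflow(a, b, &res);
}
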
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a - (u64)__b; \
+ ((((__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ typeof(a) __tmax = type_max(typeof(a)); \
+ typeof(a) __tmin = type_min(typeof(a)); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a * (u64)__b; \
+ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
+ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
+ (__b == (typeof(__b))-1 && __a == __tmin); \
+})
+
+
+#define check_add_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_add_overflow(a, b, d), \
+ __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_sub_overflow(a, b, d), \
+ __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_mul_overflow(a, b, d), \
+ __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
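
A hedged usage sketch of the generic checkers; the helper name and the -EOVERFLOW convention are assumptions for the example, not something this header mandates:

#include <linux/errno.h>
#include <linux/overflow.h>

/* Store nr * elem_size in *bytes, refusing silently wrapped sizes. */
static inline int checked_array_bytes(size_t nr, size_t elem_size, size_t *bytes)
{
	if (check_mul_overflow(nr, elem_size, bytes))
		return -EOVERFLOW;
	return 0;
}
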
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ * 'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
+ typeof(a) _a = a; \
+ typeof(s) _s = s; \
+ typeof(d) _d = d; \
+ u64 _a_full = _a; \
+ unsigned int _to_shift = \
+ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
+ *_d = (_a_full << _to_shift); \
+ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
+ (*_d >> _to_shift) != _a); \
+}))
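
For example, converting 512-byte sectors to bytes without losing high bits (a sketch; the helper itself is hypothetical):

static inline int sectors_to_bytes(u64 sectors, u64 *bytes)
{
	/* Fails if sectors << 9 would drop bits or set the sign bit. */
	if (check_shl_overflow(sectors, 9, bytes))
		return -EOVERFLOW;
	return 0;
}
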
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_mul_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_add_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count) \
+ __ab_c_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member),\
+ sizeof(*(p)))
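
A hedged allocation sketch using struct_size(); the structure and the plain malloc() call are hypothetical stand-ins:

#include <stdlib.h>

struct sector_list {
	unsigned int	nr;
	u64		sectors[];	/* flexible array member */
};

static inline struct sector_list *sector_list_alloc(unsigned int nr)
{
	/* On overflow struct_size() returns SIZE_MAX, so the allocation
	 * fails cleanly instead of being silently undersized. */
	struct sector_list *l = malloc(struct_size(l, sectors, nr));

	if (l)
		l->nr = nr;
	return l;
}
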
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ * within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count) \
+ array_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 00000000..dc8ae5d8
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/page_poison.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
+#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
+
+#define SLUB_RED_INACTIVE 0xbb
+#define SLUB_RED_ACTIVE 0xcc
+
+/* ...and for poisoning */
+#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
+#define POISON_FREE 0x6b /* for use-after-free poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM 0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE 0x5b
+#define JBD2_POISON_FREE 0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define POOL_POISON_FREED 0xa7 /* !inuse */
+#define POOL_POISON_ALLOCATED 0xa9 /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE 0x12
+#define ATM_POISON 0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT 0x11
+#define MUTEX_DEBUG_FREE 0x22
+#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
+
+/********** security/ **********/
+#define KEY_DESTROY 0xbd
+
+#endif
diff --git a/include/linux/random.h b/include/linux/random.h
index c38ae46d..28c595a0 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -45,6 +45,7 @@ static inline type get_random_##type(void) \
get_random_type(int);
get_random_type(long);
+get_random_type(u32);
get_random_type(u64);
#endif /* _LINUX_RANDOM_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c99d78a8..ae292241 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -13,4 +13,32 @@
#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+ rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+ void (*f)(struct rcu_head *head))
+{
+ void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+ if (func == f)
+ return true;
+ return false;
+}
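
A hedged sketch of the intended pairing; the object type and free callback below are hypothetical:

#include <linux/slab.h>

struct cached_obj {
	struct rcu_head	rcu;
	/* ... payload ... */
};

static void cached_obj_free(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct cached_obj, rcu));
}

/*
 * Call rcu_head_init(&obj->rcu) right after allocating the object and
 * call_rcu(&obj->rcu, cached_obj_free) when retiring it; this test then
 * tells you whether the object has already been queued for freeing.
 */
static bool cached_obj_being_freed(struct cached_obj *obj)
{
	return rcu_head_after_call_rcu(&obj->rcu, cached_obj_free);
}
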
+
#endif /* __TOOLS_LINUX_RCUPDATE_H */
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644
index 00000000..57467cbf
--- /dev/null
+++ b/include/linux/rhashtable-types.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+ struct rhash_head __rcu *next;
+};
+
+struct rhlist_head {
+ struct rhash_head rhead;
+ struct rhlist_head __rcu *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+ struct rhashtable *ht;
+ const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+ const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+ u16 nelem_hint;
+ u16 key_len;
+ u16 key_offset;
+ u16 head_offset;
+ unsigned int max_size;
+ u16 min_size;
+ bool automatic_shrinking;
+ rht_hashfn_t hashfn;
+ rht_obj_hashfn_t obj_hashfn;
+ rht_obj_cmpfn_t obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+ struct bucket_table __rcu *tbl;
+ unsigned int key_len;
+ unsigned int max_elems;
+ struct rhashtable_params p;
+ bool rhlist;
+ struct work_struct run_work;
+ struct mutex mutex;
+ spinlock_t lock;
+ atomic_t nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+ struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+ struct list_head list;
+ struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+ struct rhashtable *ht;
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ struct rhashtable_walker walker;
+ unsigned int slot;
+ unsigned int skip;
+ bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
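
A hedged construction sketch against these types; the object layout, the table, and every name here are hypothetical:

#include <linux/rhashtable.h>

struct demo_obj {
	u64			key;
	struct rhash_head	hash;
};

static const struct rhashtable_params demo_params = {
	.key_len	= sizeof(u64),
	.key_offset	= offsetof(struct demo_obj, key),
	.head_offset	= offsetof(struct demo_obj, hash),
	.automatic_shrinking = true,
};

static struct rhashtable demo_table;

static int demo_table_init(void)
{
	return rhashtable_init(&demo_table, &demo_params);
}
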
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8dbe1533..6cf8c257 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resizable, Scalable, Concurrent Hash Table
*
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
* Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
@@ -17,92 +18,93 @@
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/list_nulls.h>
#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
-#define RHT_BASE_BITS 4
-#define RHT_HASH_BITS 27
-#define RHT_BASE_SHIFT RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
+#define BIT(nr) (1UL << (nr))
-struct rhash_head {
- struct rhash_head __rcu *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain of the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket. This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain. To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket. This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful. This ensures it isn't
+ * used by mistake without first clearing the lock bit.
+ */
+struct rhash_lock_head {};
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
struct bucket_table {
unsigned int size;
- unsigned int rehash;
+ unsigned int nest;
u32 hash_rnd;
- unsigned int locks_mask;
- spinlock_t *locks;
struct list_head walkers;
struct rcu_head rcu;
struct bucket_table __rcu *future_tbl;
- struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
-};
-
-struct rhashtable_compare_arg {
- struct rhashtable *ht;
- const void *key;
+ struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
- const void *obj);
-
-struct rhashtable_params {
- size_t nelem_hint;
- size_t key_len;
- size_t key_offset;
- size_t head_offset;
- unsigned int insecure_max_entries;
- unsigned int max_size;
- unsigned int min_size;
- u32 nulls_base;
- bool insecure_elasticity;
- bool automatic_shrinking;
- size_t locks_mul;
- rht_hashfn_t hashfn;
- rht_obj_hashfn_t obj_hashfn;
- rht_obj_cmpfn_t obj_cmpfn;
-};
-
-struct rhashtable {
- struct bucket_table __rcu *tbl;
- atomic_t nelems;
- unsigned int key_len;
- unsigned int elasticity;
- struct rhashtable_params p;
- struct work_struct run_work;
- struct mutex mutex;
- spinlock_t lock;
-};
-
-struct rhashtable_walker {
- struct list_head list;
- struct bucket_table *tbl;
-};
-
-#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
- return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
- ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL
+ */
+#define RHT_NULLS_MARKER(ptr) \
+ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr) \
+ ((ptr) = NULL)
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
@@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht,
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
- return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+ return hash & (tbl->size - 1);
}
-static inline unsigned int rht_key_hashfn(
- struct rhashtable *ht, const struct bucket_table *tbl,
- const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+ const void *key, const struct rhashtable_params params,
+ unsigned int hash_rnd)
{
unsigned int hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
- hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+ hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
else if (params.key_len) {
unsigned int key_len = params.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else if (key_len & (sizeof(u32) - 1))
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
else
- hash = jhash2(key, key_len / sizeof(u32),
- tbl->hash_rnd);
+ hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
} else {
unsigned int key_len = ht->p.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
}
+ return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const void *key, const struct rhashtable_params params)
+{
+ unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
return rht_bucket_index(tbl, hash);
}
@@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn(
rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
tbl->size > ht->p.min_size;
}
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
- return ht->p.insecure_max_entries &&
- atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+ return atomic_read(&ht->nelems) >= ht->max_elems;
}
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
- unsigned int hash)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
- return &tbl->locks[hash & tbl->locks_mask];
+ return 1;
}
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
- const void *,
- struct rhash_head *,
- struct bucket_table *);
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+ u32 hash)
+{
+ return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj);
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_walk_enter(struct rhashtable *ht,
+ struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
-#define rht_dereference(p, ht) rcu_dereference(p)
-#define rht_dereference_rcu(p, ht) rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+ (void)rhashtable_walk_start_check(iter);
+}
+
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+ rcu_dereference(p)
#define rht_entry(tpos, pos, member) \
({ tpos = container_of(pos, typeof(*tpos), member); 1; })
-#define rht_for_each_continue(pos, head, tbl, hash) \
- for (pos = rht_dereference_bucket(head, tbl, hash); \
- !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+ struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers. The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer. In that case
+ * we cannot get a lock. For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there. In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bucket,
+ unsigned int subclass)
+{
+ bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+ return (struct rhash_head *)
+ ((unsigned long)p & ~BIT(0) ?:
+ (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ * rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ * rht_ptr() dereferences in a context where the bucket is locked.
+ * rht_ptr_exclusive() dereferences in a context where exclusive
+ * access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+ struct rhash_lock_head __rcu *const *bkt,
+ struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)obj);
+ preempt_enable();
+ __release(bitlock);
+}
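
A condensed, hypothetical sketch of the locking pattern described above, inserting at the head of a bucket; it deliberately omits the error handling, nelems accounting, and resize checks that the real insert path performs:

static inline void demo_bucket_push(struct rhashtable *ht,
				    struct bucket_table *tbl,
				    struct rhash_head *obj,
				    unsigned int hash)
{
	struct rhash_lock_head __rcu **bkt = rht_bucket_insert(ht, tbl, hash);

	rht_lock(tbl, bkt);				/* sets BIT(0) */
	RCU_INIT_POINTER(obj->next, rht_ptr(bkt, tbl, hash));
	rht_assign_unlock(tbl, bkt, obj);		/* publish head + unlock */
}
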
+
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+ for (pos = head; \
+ !rht_is_a_nulls(pos); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
#define rht_for_each(pos, tbl, hash) \
- rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \
+ for (pos = head; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
-#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_from(tpos, pos, \
+ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @next: the &struct rhash_head to use as next in loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
+ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = next, \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_rcu(pos, tbl, hash) \
- rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash) \
+ for (({barrier(); }), \
+ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
- tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_rcu_from(tpos, pos, \
+ rht_ptr_rcu(rht_bucket(tbl, hash)), \
+ tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos: the &struct rhlist_head to use as a loop cursor.
+ * @list: the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list) \
+ for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhlist_head to use as a loop cursor.
+ * @list: the head of the list
+ * @member: name of the &struct rhlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member) \
+ for (pos = list; pos && rht_entry(tpos, pos, member); \
+ pos = rcu_dereference_raw(pos->next))
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
const void *obj)
@@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
@@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast(
.ht = ht,
.key = key,
};
- const struct bucket_table *tbl;
+ struct rhash_lock_head __rcu *const *bkt;
+ struct bucket_table *tbl;
struct rhash_head *he;
unsigned int hash;
- rcu_read_lock();
-
tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, key, params);
- rht_for_each_rcu(he, tbl, hash) {
- if (params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, he)) :
- rhashtable_compare(&arg, rht_obj(ht, he)))
- continue;
- rcu_read_unlock();
- return rht_obj(ht, he);
- }
+ bkt = rht_bucket(tbl, hash);
+ do {
+ rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+ if (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+ rhashtable_compare(&arg, rht_obj(ht, he)))
+ continue;
+ return he;
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
@@ -286,150 +607,594 @@ restart:
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
- rcu_read_unlock();
return NULL;
}
-static inline int __rhashtable_insert_fast(
- struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+ return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ void *obj;
+
+ rcu_read_lock();
+ obj = rhashtable_lookup(ht, key, params);
+ rcu_read_unlock();
+
+ return obj;
+}
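
A hedged lookup sketch, reusing the hypothetical demo_obj/demo_params from the rhashtable-types.h example above:

static bool demo_contains(struct rhashtable *ht, u64 key)
{
	bool found;

	rcu_read_lock();
	found = rhashtable_lookup(ht, &key, demo_params) != NULL;
	rcu_read_unlock();

	return found;
}

For a plain presence check like this, rhashtable_lookup_fast() would bundle the rcu_read_lock()/rcu_read_unlock() pair itself.
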
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+ struct rhltable *hlt, const void *key,
const struct rhashtable_params params)
{
+ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+ return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash if there is a
+ * clash, otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
- struct bucket_table *tbl, *new_tbl;
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct bucket_table *tbl;
struct rhash_head *head;
- spinlock_t *lock;
- unsigned int elasticity;
unsigned int hash;
- int err;
+ int elasticity;
+ void *data;
-restart:
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ elasticity = RHT_ELASTICITY;
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ data = ERR_PTR(-ENOMEM);
+ if (!bkt)
+ goto out;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
+ if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+ rht_unlock(tbl, bkt);
+ rcu_read_unlock();
+ return rhashtable_insert_slow(ht, key, obj);
+ }
- if (tbl->rehash <= hash)
- break;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *plist;
+ struct rhlist_head *list;
- spin_unlock_bh(lock);
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- }
+ elasticity--;
+ if (!key ||
+ (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- if (unlikely(new_tbl)) {
- tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
- if (!IS_ERR_OR_NULL(tbl))
- goto slow_path;
+ data = rht_obj(ht, head);
- err = PTR_ERR(tbl);
- goto out;
- }
+ if (!rhlist)
+ goto out_unlock;
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto out;
- if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
- spin_unlock_bh(lock);
- err = rhashtable_insert_rehash(ht, tbl);
- rcu_read_unlock();
- if (err)
- return err;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- goto restart;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else
+ rht_assign_unlock(tbl, bkt, obj);
+ data = NULL;
+ goto out;
}
- err = -EEXIST;
- elasticity = ht->elasticity;
- rht_for_each(head, tbl, hash) {
- if (key &&
- unlikely(!(params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, head)) :
- rhashtable_compare(&arg, rht_obj(ht, head)))))
- goto out;
- if (!--elasticity)
- goto slow_path;
- }
+ if (elasticity <= 0)
+ goto slow_path;
+
+ data = ERR_PTR(-E2BIG);
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto out_unlock;
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ goto slow_path;
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ /* Inserting at head of list makes unlocking free. */
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
atomic_inc(&ht->nelems);
+ rht_assign_unlock(tbl, bkt, obj);
+
if (rht_grow_above_75(ht, tbl))
schedule_work(&ht->run_work);
+ data = NULL;
out:
- spin_unlock_bh(lock);
rcu_read_unlock();
- return err;
+ return data;
+
+out_unlock:
+ rht_unlock(tbl, bkt);
+ goto out;
}
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
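
/*
 * A minimal usage sketch, not part of this header: the names test_obj,
 * test_params and test_insert are illustrative only.  The object embeds a
 * struct rhash_head and is keyed by a u32 stored directly in the object
 * (offsetof() comes from the usual kernel headers).  Note that with no key
 * passed, rhashtable_insert_fast() does not check for duplicates; see
 * rhashtable_lookup_insert_fast() below for that.
 */
struct test_obj {
	u32			id;
	struct rhash_head	node;
};

static const struct rhashtable_params test_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct test_obj, id),
	.head_offset	= offsetof(struct test_obj, node),
	.automatic_shrinking = true,
};

static inline int test_insert(struct rhashtable *ht, struct test_obj *obj)
{
	/* 0 on success, or a -errno if the insert could not be performed: */
	return rhashtable_insert_fast(ht, &obj->node, test_params);
}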
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt: hash list table
+ * @key: the pointer to the key
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+ struct rhltable *hlt, const void *key, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+ params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against concurrent mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+ key += params.key_offset;
+
+ return rhltable_insert_key(hlt, key, list, params);
+}
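
/*
 * Sketch of the duplicate-key (rhltable) variant, with hypothetical names
 * test_dup, test_dup_params, test_dup_add: the object embeds a
 * struct rhlist_head, and duplicate ids are chained on a single bucket
 * entry rather than rejected.  Removal goes through rhltable_remove(),
 * defined further down in this header.
 */
struct test_dup {
	u32			id;
	struct rhlist_head	list;
};

static const struct rhashtable_params test_dup_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct test_dup, id),
	.head_offset	= offsetof(struct test_dup, list),
};

static inline int test_dup_add(struct rhltable *hlt, struct test_dup *d)
{
	return rhltable_insert(hlt, &d->list, test_dup_params);
}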
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * This lookup function may only be used for a fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
+ void *ret;
BUG_ON(ht->p.obj_hashfn);
- return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
- params);
+ ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
}
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(ht, obj);
+
+ BUG_ON(ht->p.obj_hashfn);
+
+ return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+}
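
/*
 * Sketch of how a caller can distinguish the three outcomes, reusing the
 * hypothetical test_obj/test_params from the sketch above (test_get_or_insert
 * is not part of this header):
 */
static inline struct test_obj *test_get_or_insert(struct rhashtable *ht,
						  struct test_obj *new_obj)
{
	void *old;

	old = rhashtable_lookup_get_insert_fast(ht, &new_obj->node, test_params);
	if (IS_ERR(old))
		return ERR_CAST(old);	/* e.g. -ENOMEM; nothing was inserted */
	if (!old)
		return new_obj;		/* no existing entry; new_obj inserted */
	return old;			/* existing entry wins; new_obj not inserted */
}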
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ * with explicit key
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
struct rhashtable *ht, struct bucket_table *tbl,
- struct rhash_head *obj, const struct rhashtable_params params)
+ struct rhash_head *obj, const struct rhashtable_params params,
+ bool rhlist)
{
+ struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
- spinlock_t * lock;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- spin_lock_bh(lock);
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+
+ list = container_of(he, struct rhlist_head, rhead);
- pprev = &tbl->buckets[hash];
- rht_for_each(he, tbl, hash) {
if (he != obj) {
+ struct rhlist_head __rcu **lpprev;
+
pprev = &he->next;
- continue;
+
+ if (!rhlist)
+ continue;
+
+ do {
+ lpprev = &list->next;
+ list = rht_dereference_bucket(list->next,
+ tbl, hash);
+ } while (list && obj != &list->rhead);
+
+ if (!list)
+ continue;
+
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ RCU_INIT_POINTER(*lpprev, list);
+ err = 0;
+ break;
}
- rcu_assign_pointer(*pprev, obj->next);
+ obj = rht_dereference_bucket(obj->next, tbl, hash);
+ err = 1;
+
+ if (rhlist) {
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ if (list) {
+ RCU_INIT_POINTER(list->rhead.next, obj);
+ obj = &list->rhead;
+ err = 0;
+ }
+ }
+
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj);
+ }
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+unlocked:
+ if (err > 0) {
+ atomic_dec(&ht->nelems);
+ if (unlikely(ht->p.automatic_shrinking &&
+ rht_shrink_below_30(ht, tbl)))
+ schedule_work(&ht->run_work);
err = 0;
- break;
}
- spin_unlock_bh(lock);
+ return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+ * visible then that guarantees the entry to still be in
+ * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+ rhlist)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ rcu_read_unlock();
return err;
}
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
static inline int rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
+ return __rhashtable_remove_fast(ht, obj, params, false);
+}
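
/*
 * Removal sketch with a hypothetical test_obj_rcu: because concurrent lookups
 * may still hold RCU references to a removed object, it is normally freed
 * only after a grace period, e.g. via kfree_rcu() as in the kernel proper
 * (assuming that helper is available in this environment).
 */
struct test_obj_rcu {
	u32			id;
	struct rhash_head	node;
	struct rcu_head		rcu;
};

static inline void test_del(struct rhashtable *ht, struct test_obj_rcu *obj,
			    const struct rhashtable_params params)
{
	if (!rhashtable_remove_fast(ht, &obj->node, params))
		kfree_rcu(obj, rcu);	/* free only after readers are done */
}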
+
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj_old, struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ /* Minimally, the old and new objects must have the same hash
+ * (which should mean identifiers are the same).
+ */
+ hash = rht_head_hashfn(ht, tbl, obj_old, params);
+ if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+ return -EINVAL;
+
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+
+ pprev = NULL;
+ rht_lock(tbl, bkt);
+
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ if (he != obj_old) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(obj_new->next, obj_old->next);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj_new);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj_new);
+ }
+ err = 0;
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+
+unlocked:
+ return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht: hash table
+ * @obj_old: pointer to hash head inside object being replaced
+ * @obj_new: pointer to hash head inside object which is new
+ * @params: hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+ struct rhashtable *ht, struct rhash_head *obj_old,
+ struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
struct bucket_table *tbl;
int err;
@@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast(
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
- while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+ obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
- if (err)
- goto out;
-
- atomic_dec(&ht->nelems);
- if (unlikely(ht->p.automatic_shrinking &&
- rht_shrink_below_30(ht, tbl)))
- schedule_work(&ht->run_work);
-
-out:
rcu_read_unlock();
return err;
}
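
/*
 * Replace sketch, reusing the hypothetical test_obj/test_params from above
 * (test_replace is not part of this header): the replacement must hash to the
 * same bucket, which in practice means it carries the same key.
 */
static inline int test_replace(struct rhashtable *ht, struct test_obj *old_obj,
			       struct test_obj *new_obj)
{
	new_obj->id = old_obj->id;	/* same key, hence same hash, as required */
	return rhashtable_replace_fast(ht, &old_obj->node, &new_obj->node,
				       test_params);
}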
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+ struct rhashtable_iter *iter)
+{
+ return rhashtable_walk_enter(&hlt->ht, iter);
+}
+
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt: the hash list table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+ void (*free_fn)(void *ptr,
+ void *arg),
+ void *arg)
+{
+ return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+ return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
#endif /* _LINUX_RHASHTABLE_H */
diff --git a/include/linux/six.h b/include/linux/six.h
index 0e6df059..477c33eb 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -196,6 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
+void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b8a1235b..775b7e3a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
#define kzfree(p) free(p)
#define kvmalloc(size, flags) kmalloc(size, flags)
+#define kvzalloc(size, flags) kzalloc(size, flags)
#define kvfree(p) kfree(p)
static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
diff --git a/include/linux/types.h b/include/linux/types.h
index 1e125550..c9886cba 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -11,6 +11,8 @@
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#include <asm/types.h>
+#include <linux/cache.h>
+
#define BITS_PER_LONG __BITS_PER_LONG
struct page;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 532f23b9..cb225951 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -138,19 +138,18 @@ struct bpos {
#define KEY_SNAPSHOT_MAX ((__u32)~0U)
#define KEY_SIZE_MAX ((__u32)~0U)
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
- struct bpos ret;
-
- ret.inode = inode;
- ret.offset = offset;
- ret.snapshot = 0;
-
- return ret;
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
}
-#define POS_MIN POS(0, 0)
-#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
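
/*
 * Illustrative only (example_pos is not part of this header): POS() keeps the
 * old two-argument form by defaulting the new snapshot field to zero, while
 * snapshot-aware callers pass it explicitly via SPOS():
 */
static inline struct bpos example_pos(__u64 inode, __u64 offset, int snapshots)
{
	return snapshots
		? SPOS(inode, offset, KEY_SNAPSHOT_MAX)
		: POS(inode, offset);	/* i.e. SPOS(inode, offset, 0) */
}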
/* Empty placeholder struct, for container_of() */
struct bch_val {
@@ -707,7 +706,9 @@ struct bch_inode_generation {
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
- x(bi_fields_set, 16)
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -743,6 +744,7 @@ enum {
__BCH_INODE_I_SIZE_DIRTY = 5,
__BCH_INODE_I_SECTORS_DIRTY = 6,
__BCH_INODE_UNLINKED = 7,
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
/* bits 20+ reserved for packed fields below: */
};
@@ -755,6 +757,7 @@ enum {
#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
@@ -1204,7 +1207,9 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_new_versioning = 10,
bcachefs_metadata_version_bkey_renumber = 10,
bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_max = 12,
+ bcachefs_metadata_version_snapshot = 12,
+ bcachefs_metadata_version_inode_backpointers = 13,
+ bcachefs_metadata_version_max = 14,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -1736,7 +1741,7 @@ struct btree_node {
/* Closed interval: */
struct bpos min_key;
struct bpos max_key;
- struct bch_extent_ptr ptr;
+ struct bch_extent_ptr _ptr; /* not used anymore */
struct bkey_format format;
union {
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index e1906f25..3af56062 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
return "incorrect number of fields";
for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > 64)
+ if (f->bits_per_field[i] > unpacked_bits)
return "field too large";
- if (field_offset &&
- (f->bits_per_field[i] == 64 ||
- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
- field_offset)))
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return "offset + bits overflow";
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
return "offset + bits overflow";
bits += f->bits_per_field[i];
@@ -1045,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
high_word(f, r),
b->nr_key_bits);
- EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
bkey_unpack_pos(b, r)));
return ret;
}
@@ -1055,7 +1059,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
const struct bkey_packed *l,
const struct bpos *r)
{
- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
}
__pure __flatten
@@ -1076,7 +1080,7 @@ int bch2_bkey_cmp_packed(const struct btree *b,
r = (void*) &unpacked;
}
- return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
__pure __flatten
@@ -1087,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b,
const struct bkey *l_unpacked;
return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bkey_cmp(l_unpacked->p, *r)
+ ? bpos_cmp(l_unpacked->p, *r)
: __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
@@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void)
struct bkey_packed p;
struct bkey_format test_format = {
- .key_u64s = 2,
+ .key_u64s = 3,
.nr_fields = BKEY_NR_FIELDS,
.bits_per_field = {
13,
64,
+ 32,
},
};
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 629288a6..2e45d88f 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -33,16 +33,6 @@ struct bkey_s {
#define bkey_next(_k) vstruct_next(_k)
-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
- struct bkey_packed *end)
-{
- k = bkey_next(k);
-
- while (k != end && !k->u64s)
- k = (void *) ((u64 *) k + 1);
- return k;
-}
-
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -150,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
return bkey_cmp_left_packed(b, l, &r);
}
-#if 1
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
+}
+
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
- if (l.inode != r.inode)
- return l.inode < r.inode ? -1 : 1;
- if (l.offset != r.offset)
- return l.offset < r.offset ? -1 : 1;
- if (l.snapshot != r.snapshot)
- return l.snapshot < r.snapshot ? -1 : 1;
- return 0;
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
}
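
/*
 * Hypothetical illustration of the split made here: bpos_cmp() is a total
 * order over all three fields, while bkey_cmp() ignores the snapshot field.
 */
static inline void bpos_cmp_example(void)
{
	struct bpos a = SPOS(1, 10, 0);
	struct bpos b = SPOS(1, 10, 5);

	BUG_ON(bkey_cmp(a, b));		/* equal when snapshots are ignored */
	BUG_ON(bpos_cmp(a, b) >= 0);	/* but a sorts strictly before b */
}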
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) < 0 ? l : r;
+ return bpos_cmp(l, r) < 0 ? l : r;
}
static inline struct bpos bpos_max(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) > 0 ? l : r;
+ return bpos_cmp(l, r) > 0 ? l : r;
}
#define sbb(a, b, borrow) \
@@ -200,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
{
- if (bkey_cmp(l, r) > 0)
+ if (bpos_cmp(l, r) > 0)
swap(l, r);
return bpos_sub(r, l);
@@ -262,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
}
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
{
- struct bpos ret = p;
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
- if (!++ret.offset)
- BUG_ON(!++ret.inode);
+ return p;
+}
- return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
{
- struct bpos ret = p;
+ p.snapshot = 0;
- if (!ret.offset--)
- BUG_ON(!ret.inode--);
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
- return ret;
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
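
/*
 * Hypothetical illustration of the carry behaviour: overflowing .snapshot
 * carries into .offset, and overflowing .offset carries into .inode.
 */
static inline void bpos_successor_example(void)
{
	struct bpos p = bpos_successor(SPOS(1, 2, KEY_SNAPSHOT_MAX));

	BUG_ON(bpos_cmp(p, SPOS(1, 3, 0)));	/* snapshot wrapped to 0, offset bumped */
}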
static inline u64 bkey_start_offset(const struct bkey *k)
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 641169ef..6fe95b80 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -119,10 +119,17 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
return "nonzero size field";
}
- if (k.k->p.snapshot)
+ if (type != BKEY_TYPE_btree &&
+ !btree_type_has_snapshots(type) &&
+ k.k->p.snapshot)
return "nonzero snapshot";
if (type != BKEY_TYPE_btree &&
+ btree_type_has_snapshots(type) &&
+ k.k->p.snapshot != U32_MAX)
+ return "invalid snapshot field";
+
+ if (type != BKEY_TYPE_btree &&
!bkey_cmp(k.k->p, POS_MAX))
return "POS_MAX key";
@@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
- if (bkey_cmp(k.k->p, b->data->min_key) < 0)
+ if (bpos_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
- if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+ if (bpos_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
return NULL;
@@ -165,9 +172,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
- if (!bkey_cmp(pos, POS_MIN))
+ if (!bpos_cmp(pos, POS_MIN))
pr_buf(out, "POS_MIN");
- else if (!bkey_cmp(pos, POS_MAX))
+ else if (!bpos_cmp(pos, POS_MAX))
pr_buf(out, "POS_MAX");
else {
if (pos.inode == U64_MAX)
@@ -256,7 +263,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
!ops->key_merge ||
l.k->type != r.k->type ||
bversion_cmp(l.k->version, r.k->version) ||
- bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+ bpos_cmp(l.k->p, bkey_start_pos(r.k)))
return BCH_MERGE_NOMERGE;
ret = ops->key_merge(c, l, r);
@@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
const struct bkey_ops *ops;
struct bkey uk;
struct bkey_s u;
+ unsigned nr_compat = 5;
int i;
/*
* Do these operations in reverse order in the write path:
*/
- for (i = 0; i < 4; i++)
- switch (!write ? i : 3 - i) {
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
case 0:
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_key(f, k);
@@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
}
break;
case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
+
+ break;
+ case 4:
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index f2507079..537ab791 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(!iter->used);
- i->k = bkey_next_skip_noops(i->k, i->end);
+ i->k = bkey_next(i->k);
BUG_ON(i->k > i->end);
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 87f951e1..3fb9a9ed 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
for (_k = i->start;
_k < vstruct_last(i);
_k = _n) {
- _n = bkey_next_skip_noops(_k, vstruct_last(i));
+ _n = bkey_next(_k);
k = bkey_disassemble(b, _k, &uk);
if (c)
@@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
n = bkey_unpack_key(b, _n);
- if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) {
+ if (bpos_cmp(n.p, k.k->p) < 0) {
printk(KERN_ERR "Key skipped backwards\n");
continue;
}
if (!bkey_deleted(k.k) &&
- !bkey_cmp(n.p, k.k->p))
+ !bpos_cmp(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
}
@@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
goto start;
while (1) {
if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
bkey_unpack_pos(b, k)));
start:
if (++j == t->size)
@@ -544,7 +544,7 @@ start:
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ k = bkey_next(k);
BUG_ON(k >= btree_bkey_last(b, t));
}
}
@@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (is_power_of_2(j) &&
!min_key->u64s) {
- k = (void *) min_key;
- bkey_init(&k->k);
- k->k.p = b->data->min_key;
+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
+ k = (void *) min_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->min_key;
+ }
}
if (is_power_of_2(j + 1) &&
!max_key->u64s) {
- k = (void *) max_key;
- bkey_init(&k->k);
- k->k.p = t->max_key;
+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
+ k = (void *) max_key;
+ bkey_init(&k->k);
+ k->k.p = t->max_key;
+ }
}
__make_bfloat(b, t, j, min_key, max_key);
@@ -759,7 +763,7 @@ retry:
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -776,14 +780,19 @@ retry:
}
while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
t->max_key = bkey_unpack_pos(b, prev);
- bkey_init(&min_key.k);
- min_key.k.p = b->data->min_key;
- bkey_init(&max_key.k);
- max_key.k.p = t->max_key;
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
+
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = t->max_key;
+ }
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
@@ -911,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next_skip_noops(i, k))
+ for (i = p; i != k; i = bkey_next(i))
if (i->type >= min_key_type)
ret = i;
@@ -922,10 +931,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
BUG_ON(ret >= orig_k);
for (i = ret
- ? bkey_next_skip_noops(ret, orig_k)
+ ? bkey_next(ret)
: btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next_skip_noops(i, orig_k))
+ i = bkey_next(i))
BUG_ON(i->type >= min_key_type);
}
@@ -960,7 +969,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
/* signal to make_bfloat() that they're uninitialized: */
min_key.u64s = max_key.u64s = 0;
- if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
t->max_key = bkey_unpack_pos(b, k);
for (j = 1; j < t->size; j = j * 2 + 1)
@@ -1084,7 +1093,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
struct bkey_packed *k = start;
while (1) {
- k = bkey_next_skip_noops(k, end);
+ k = bkey_next(k);
if (k == end)
break;
@@ -1170,15 +1179,14 @@ void bch2_bset_delete(struct btree *b,
__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *packed_search)
+ struct bpos *search)
{
unsigned l = 0, r = t->size;
while (l + 1 != r) {
unsigned m = (l + r) >> 1;
- if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
l = m;
else
r = m;
@@ -1238,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
prefetch(&base->f[n << 4]);
f = &base->f[n];
-
- if (!unlikely(packed_search))
- goto slowpath;
if (unlikely(f->exponent >= BFLOAT_FAILED))
goto slowpath;
@@ -1304,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
case BSET_NO_AUX_TREE:
return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search, lossy_packed_search);
+ return bset_search_write_set(b, t, search);
case BSET_RO_AUX_TREE:
/*
* Each node in the auxiliary search tree covers a certain range
@@ -1313,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
* start and end - handle that here:
*/
- if (bkey_cmp(*search, t->max_key) > 0)
+ if (bpos_cmp(*search, t->max_key) > 0)
return btree_bkey_last(b, t);
return bset_search_tree(b, t, search, lossy_packed_search);
@@ -1334,12 +1339,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, m,
lossy_packed_search, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1403,16 +1408,15 @@ noinline __flatten __attribute__((cold))
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
+ struct bkey_packed *k;
trace_bkey_pack_pos_fail(search);
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search, NULL, NULL),
- btree_bkey_last(b, t));
+ bch2_btree_node_iter_init_from_start(iter, b);
- bch2_btree_node_iter_sort(iter, b);
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
}
/**
@@ -1446,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
* to the search key is going to have 0 sectors after the search key.
*
* But this does mean that we can't just search for
- * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
* the range we want - if we're unlucky and there's an extent that ends
* exactly where we searched, then there could be a deleted key at the same
* position and we'd get that when we search instead of the preceding extent
@@ -1464,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct bkey_packed *k[MAX_BSETS];
unsigned i;
- EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
bset_aux_tree_verify(b);
memset(iter, 0, sizeof(*iter));
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 54b364c8..506da4e0 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
_k != btree_bkey_last(_b, _t); \
- _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+ _k = bkey_next(_k))
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
@@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
EBUG_ON(r_packed && !bkey_packed(r_packed));
if (unlikely(!bkey_packed(l)))
- return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
if (likely(r_packed))
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
@@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
return bch2_bkey_prev_filter(b, t, k, 1);
}
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
-
- return (cmp1 << 1) + cmp2;
-}
-
/* Btree key iteration */
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index fc76e788..8a4667ba 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -149,7 +149,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
if (level)
six_lock_pcpu_alloc(&b->c.lock);
else
- six_lock_pcpu_free(&b->c.lock);
+ six_lock_pcpu_free_rcu(&b->c.lock);
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
@@ -814,9 +814,9 @@ lock_node:
EBUG_ON(b->c.btree_id != iter->btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- bkey_cmp(b->data->min_key,
+ bpos_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
@@ -897,9 +897,9 @@ lock_node:
EBUG_ON(b->c.btree_id != btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- bkey_cmp(b->data->min_key,
+ bpos_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
bch2_btree_cache_cannibalize_unlock(c);
@@ -1011,7 +1011,7 @@ out:
if (sib != btree_prev_sib)
swap(n1, n2);
- if (bkey_cmp(bkey_successor(n1->key.k.p),
+ if (bpos_cmp(bpos_successor(n1->key.k.p),
n2->data->min_key)) {
char buf1[200], buf2[200];
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6d5ed774..88c549c4 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
struct bpos node_end = b->data->max_key;
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
- : bkey_successor(prev->k->k.p);
+ : bpos_successor(prev->k->k.p);
char buf1[200], buf2[200];
bool update_min = false;
bool update_max = false;
@@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
}
- if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
+ if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" cur %s",
@@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
}
if (fsck_err_on(is_last &&
- bkey_cmp(cur.k->k.p, node_end), c,
+ bpos_cmp(cur.k->k.p, node_end), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
@@ -470,8 +470,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
k, &max_stale, true);
@@ -560,13 +560,13 @@ static int bch2_gc_btree_init(struct bch_fs *c,
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
- if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
+ if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
"btree root with incorrect min_key: %s",
(bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
BUG();
}
- if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
+ if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
"btree root with incorrect max_key: %s",
(bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
BUG();
@@ -1148,7 +1148,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
@@ -1171,6 +1173,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_btree_iter_advance(iter);
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@@ -1271,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
/* Find a format that all keys in @old_nodes can pack into */
bch2_bkey_format_init(&format_state);
+ /*
+ * XXX: this won't correctly take into account the new min/max keys:
+ */
for (i = 0; i < nr_old_nodes; i++)
__bch2_btree_calc_format(&format_state, old_nodes[i]);
@@ -1333,7 +1339,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
k < vstruct_last(s2) &&
vstruct_blocks_plus(n1->data, c->block_bits,
u64s + k->u64s) <= blocks;
- k = bkey_next_skip_noops(k, vstruct_last(s2))) {
+ k = bkey_next(k)) {
last = k;
u64s += k->u64s;
}
@@ -1362,7 +1368,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
n1->key.k.p = n1->data->max_key =
bkey_unpack_pos(n1, last);
- n2->data->min_key = bkey_successor(n1->data->max_key);
+ n2->data->min_key = bpos_successor(n1->data->max_key);
memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
@@ -1405,7 +1411,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
unsigned j;
for (j = 0; j < nr_new_nodes; j++)
- if (!bkey_cmp(old_nodes[i]->key.k.p,
+ if (!bpos_cmp(old_nodes[i]->key.k.p,
new_nodes[j]->key.k.p))
goto next;
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index c3d02f58..b1362a9f 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- if (l.phase != r.phase)
- return l.phase < r.phase ? -1 : 1;
- if (bkey_cmp(l.pos, r.pos))
- return bkey_cmp(l.pos, r.pos);
- if (l.level != r.level)
- return l.level < r.level ? -1 : 1;
- return 0;
+ return cmp_int(l.phase, r.phase) ?:
+ bpos_cmp(l.pos, r.pos) ?:
+ cmp_int(l.level, r.level);
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 9b74e799..b43d4468 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -32,13 +32,13 @@ static void verify_no_dups(struct btree *b,
if (start == end)
return;
- for (p = start, k = bkey_next_skip_noops(start, end);
+ for (p = start, k = bkey_next(start);
k != end;
- p = k, k = bkey_next_skip_noops(k, end)) {
+ p = k, k = bkey_next(k)) {
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
}
#endif
}
@@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v)
{
struct bkey_packed *k;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i)))
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
k->needs_whiteout = v;
}
@@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next_skip_noops(k, end);
+ n = bkey_next(k);
if (!bkey_deleted(k)) {
bkey_copy(out, k);
@@ -614,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect level");
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &bn->ptr;
-
- *p = swab64(*p);
- }
-
if (!write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
@@ -633,14 +625,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
b->data->max_key = b->key.k.p;
}
- btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
+ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %s should be %s",
(bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
(bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
}
- btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
+ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %s",
(bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
@@ -754,7 +746,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
}
prev = k;
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
fsck_err:
return ret;
@@ -947,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bp.v->mem_ptr = 0;
}
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
@@ -1327,8 +1319,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
return -1;
- ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?:
- validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+ validate_bset(c, NULL, b, i, sectors, WRITE, false);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -1481,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
validate_before_checksum = true;
/* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change)
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
/* if we're going to be encrypting, check metadata validity first: */
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 16ce6dff..9c14cd30 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_id == BTREE_ID_inodes) {
@@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id,
swap(f->field_offset[BKEY_FIELD_INODE],
f->field_offset[BKEY_FIELD_OFFSET]);
}
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : U32_MAX - max_packed;
+ }
}
static inline void compat_bpos(unsigned level, enum btree_id btree_id,
@@ -220,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
write)
- bn->min_key = bkey_predecessor(bn->min_key);
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
!write)
- bn->min_key = bkey_successor(bn->min_key);
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
}
#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 459d27ca..8190e73d 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -18,6 +18,36 @@
static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
@@ -30,20 +60,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
bkey_cmp(pos, POS_MAX))
- pos = bkey_successor(pos);
+ pos = bkey_successor(iter, pos);
return pos;
}
static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(iter->real_pos, b->data->min_key) < 0;
+ return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
}
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(b->key.k.p, iter->real_pos) < 0;
+ return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -285,7 +315,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
- bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
+ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
btree_iter_type(linked))) <= 0) {
deadlock_iter = linked;
reason = 7;
@@ -583,10 +613,24 @@ err:
static void bch2_btree_iter_verify(struct btree_iter *iter)
{
+ enum btree_iter_type type = btree_iter_type(iter);
unsigned i;
EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type == BTREE_ITER_NODES &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type != BTREE_ITER_NODES &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshots(iter->btree_id));
+
bch2_btree_iter_verify_locks(iter);
for (i = 0; i < BTREE_MAX_DEPTH; i++)
@@ -597,6 +641,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
enum btree_iter_type type = btree_iter_type(iter);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
BUG_ON((type == BTREE_ITER_KEYS ||
type == BTREE_ITER_CACHED) &&
(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
@@ -1384,7 +1431,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
if (!b)
return NULL;
- BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
iter->pos = iter->real_pos = b->key.k.p;
@@ -1421,12 +1468,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (!b)
return NULL;
- if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+ if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- btree_iter_set_search_pos(iter, bkey_successor(iter->pos));
+ btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
/* Unlock to avoid screwing up our lock invariants: */
btree_node_unlock(iter, iter->level);
@@ -1453,7 +1500,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
{
- int cmp = bkey_cmp(new_pos, iter->real_pos);
+ int cmp = bpos_cmp(new_pos, iter->real_pos);
unsigned l = iter->level;
if (!cmp)
@@ -1497,10 +1544,10 @@ out:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = bkey_cmp(pos, POS_MAX) != 0;
+ bool ret = bpos_cmp(pos, POS_MAX) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(pos);
+ pos = bkey_successor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
}
@@ -1508,10 +1555,10 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = bkey_cmp(pos, POS_MIN) != 0;
+ bool ret = bpos_cmp(pos, POS_MIN) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_predecessor(pos);
+ pos = bkey_predecessor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
}
@@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
struct bpos next_pos = iter->l[0].b->key.k.p;
- bool ret = bkey_cmp(next_pos, POS_MAX) != 0;
+ bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
/*
* Typically, we don't want to modify iter->pos here, since that
@@ -1527,7 +1574,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
* btree, in that case we want iter->pos to reflect that:
*/
if (ret)
- btree_iter_set_search_pos(iter, bkey_successor(next_pos));
+ btree_iter_set_search_pos(iter, bpos_successor(next_pos));
else
bch2_btree_iter_set_pos(iter, POS_MAX);
@@ -1537,10 +1584,10 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
{
struct bpos next_pos = iter->l[0].b->data->min_key;
- bool ret = bkey_cmp(next_pos, POS_MIN) != 0;
+ bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
if (ret)
- btree_iter_set_search_pos(iter, bkey_predecessor(next_pos));
+ btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
else
bch2_btree_iter_set_pos(iter, POS_MIN);
@@ -1586,13 +1633,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi
k = btree_iter_level_peek(iter, &iter->l[0]);
if (next_update &&
- bkey_cmp(next_update->k.p, iter->real_pos) <= 0)
+ bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
k = bkey_i_to_s_c(next_update);
if (likely(k.k)) {
if (bkey_deleted(k.k)) {
btree_iter_set_search_pos(iter,
- bkey_successor(k.k->p));
+ bkey_successor(iter, k.k->p));
continue;
}
@@ -1731,7 +1778,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
+ bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
}
pos = iter->pos;
@@ -1965,6 +2012,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
{
struct btree_iter *iter, *best = NULL;
+ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ !btree_type_has_snapshots(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ pos.snapshot = btree_type_has_snapshots(btree_id)
+ ? U32_MAX : 0;
+
/* We always want a fresh iterator for node iterators: */
if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES)
goto alloc_iter;
@@ -1999,11 +2054,14 @@ alloc_iter:
if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
btree_node_type_is_extents(btree_id) &&
- !(flags & BTREE_ITER_NOT_EXTENTS))
+ !(flags & BTREE_ITER_NOT_EXTENTS) &&
+ !(flags & BTREE_ITER_ALL_SNAPSHOTS))
flags |= BTREE_ITER_IS_EXTENTS;
iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+
if (!(iter->flags & BTREE_ITER_INTENT))
bch2_btree_iter_downgrade(iter);
else if (!iter->locks_want)
@@ -2026,6 +2084,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
__bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_NODES|
BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
flags);
unsigned i;
@@ -2127,6 +2186,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->nr_updates2 = 0;
trans->mem_top = 0;
+ trans->hooks = NULL;
trans->extra_journal_entries = NULL;
trans->extra_journal_entry_u64s = 0;
@@ -2137,7 +2197,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
(void *) &trans->fs_usage_deltas->memset_start);
}
- bch2_trans_cond_resched(trans);
+ if (!(flags & TRANS_RESET_NOUNLOCK))
+ bch2_trans_cond_resched(trans);
if (!(flags & TRANS_RESET_NOTRAVERSE))
bch2_btree_iter_traverse_all(trans);
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 8768f4cb..7585f989 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *);
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
bkey_init(&iter->k);
iter->k.p = iter->pos = new_pos;
}
@@ -303,6 +306,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr
}
#define TRANS_RESET_NOTRAVERSE (1 << 0)
+#define TRANS_RESET_NOUNLOCK (1 << 1)
void bch2_trans_reset(struct btree_trans *, unsigned);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0b354563..04354f56 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const struct bkey_cached_key *key = arg->key;
return cmp_int(ck->key.btree_id, key->btree_id) ?:
- bkey_cmp(ck->key.pos, key->pos);
+ bpos_cmp(ck->key.pos, key->pos);
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
@@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
- c->nr_keys--;
+ atomic_long_dec(&c->nr_keys);
}
static void bkey_cached_free(struct btree_key_cache *bc,
@@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
- list_for_each_entry_reverse(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck)) {
- c->nr_freed--;
- return ck;
- }
-
ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
if (likely(ck)) {
INIT_LIST_HEAD(&ck->list);
@@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c)
return ck;
}
- list_for_each_entry(ck, &c->clean, list)
+ return NULL;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ mutex_lock(&c->lock);
+ list_for_each_entry_reverse(ck, &c->freed, list)
if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
+ c->nr_freed--;
+ list_del(&ck->list);
+ mutex_unlock(&c->lock);
return ck;
}
+ mutex_unlock(&c->lock);
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(c, ck);
+ rcu_read_unlock();
+ return ck;
+ }
+ }
+ rcu_read_unlock();
return NULL;
}
@@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c,
struct bpos pos)
{
struct bkey_cached *ck;
+ bool was_new = true;
ck = bkey_cached_alloc(c);
- if (!ck)
- return ERR_PTR(-ENOMEM);
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(c);
+ if (unlikely(!ck))
+ return ERR_PTR(-ENOMEM);
+
+ was_new = false;
+ }
ck->c.level = 0;
ck->c.btree_id = btree_id;
@@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c,
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (rhashtable_lookup_insert_fast(&c->table,
+ if (unlikely(rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
- bch2_btree_key_cache_params)) {
+ bch2_btree_key_cache_params))) {
/* We raced with another fill: */
- bkey_cached_free(c, ck);
+
+ if (likely(was_new)) {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ kfree(ck);
+ } else {
+ mutex_lock(&c->lock);
+ bkey_cached_free(c, ck);
+ mutex_unlock(&c->lock);
+ }
+
return NULL;
}
- c->nr_keys++;
+ atomic_long_inc(&c->nr_keys);
- list_move(&ck->list, &c->clean);
six_unlock_write(&ck->c.lock);
return ck;
@@ -213,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
const struct btree_iter *iter = p;
return ck->key.btree_id == iter->btree_id &&
- !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+ !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
__flatten
@@ -238,11 +276,8 @@ retry:
return 0;
}
- mutex_lock(&c->btree_key_cache.lock);
ck = btree_key_cache_create(&c->btree_key_cache,
iter->btree_id, iter->pos);
- mutex_unlock(&c->btree_key_cache.lock);
-
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
@@ -257,7 +292,7 @@ retry:
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
bkey_cached_check_fn, iter, _THIS_IP_)) {
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
goto retry;
}
@@ -267,7 +302,7 @@ retry:
}
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
@@ -370,15 +405,13 @@ err:
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
+ BUG_ON(!btree_node_locked(c_iter, 0));
+
if (!evict) {
- mutex_lock(&c->btree_key_cache.lock);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty--;
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
-
- list_move_tail(&ck->list, &c->btree_key_cache.clean);
- mutex_unlock(&c->btree_key_cache.lock);
} else {
evict:
BUG_ON(!btree_node_intent_locked(c_iter, 0));
@@ -388,13 +421,14 @@ evict:
six_lock_write(&ck->c.lock, NULL, NULL);
- mutex_lock(&c->btree_key_cache.lock);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty--;
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
bkey_cached_evict(&c->btree_key_cache, ck);
+
+ mutex_lock(&c->btree_key_cache.lock);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
}
@@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- mutex_lock(&c->btree_key_cache.lock);
- list_move(&ck->list, &c->btree_key_cache.dirty);
-
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- c->btree_key_cache.nr_dirty++;
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
if (bch2_nr_btree_keys_need_flush(c))
kick_reclaim = true;
-
- mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
@@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
struct bkey_cached *ck, *t;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned flags;
+ unsigned start, flags;
+ int srcu_idx;
/* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_FS)
@@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
else if (!mutex_trylock(&bc->lock))
return -1;
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
/*
@@ -540,23 +572,40 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
if (scanned >= nr)
goto out;
- list_for_each_entry_safe(ck, t, &bc->clean, list) {
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(bc, ck);
- bkey_cached_free(bc, ck);
- }
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ start = bc->shrink_iter;
- scanned++;
- if (scanned >= nr) {
- if (&t->list != &bc->clean)
- list_move_tail(&bc->clean, &t->list);
- goto out;
+ do {
+ struct rhash_head *pos, *next;
+
+ rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ continue;
+
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ break;
}
- }
+
+ bc->shrink_iter++;
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ } while (scanned < nr && bc->shrink_iter != start);
+
+ rcu_read_unlock();
out:
memalloc_nofs_restore(flags);
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
return freed;
@@ -569,41 +618,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
btree_key_cache.shrink);
struct btree_key_cache *bc = &c->btree_key_cache;
- return bc->nr_keys;
+ return atomic_long_read(&bc->nr_keys);
}
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
struct bkey_cached *ck, *n;
+ struct rhash_head *pos;
+ unsigned i;
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
mutex_lock(&bc->lock);
- list_splice(&bc->dirty, &bc->clean);
- list_for_each_entry_safe(ck, n, &bc->clean, list) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &bc->freed);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(ck, n, &bc->freed, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
bch2_journal_preres_put(&c->journal, &ck->res);
- kfree(ck->k);
list_del(&ck->list);
+ kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
- bc->nr_keys--;
}
- BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
- BUG_ON(bc->nr_keys);
-
- list_for_each_entry_safe(ck, n, &bc->freed, list) {
- cond_resched();
+ BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+ BUG_ON(atomic_long_read(&bc->nr_keys));
- list_del(&ck->list);
- kmem_cache_free(bch2_key_cache, ck);
- }
mutex_unlock(&bc->lock);
if (bc->table_init_done)
@@ -614,8 +667,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
- INIT_LIST_HEAD(&c->clean);
- INIT_LIST_HEAD(&c->dirty);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
@@ -641,8 +692,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
- pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys);
- pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+ pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys));
+ pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
}
void bch2_btree_key_cache_exit(void)
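
The key cache counters nr_keys/nr_dirty move from mutex-protected size_t fields to atomic_long_t so they can be adjusted without taking btree_key_cache.lock. A minimal userspace sketch of the same pattern, using C11 atomics in place of the kernel's atomic_long_t (the struct and helper names here are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative cache with lock-free statistics counters: */
struct key_cache_stats {
	atomic_long nr_keys;
	atomic_long nr_dirty;
};

static void key_added(struct key_cache_stats *s)
{
	atomic_fetch_add_explicit(&s->nr_keys, 1, memory_order_relaxed);
}

static void key_evicted(struct key_cache_stats *s)
{
	atomic_fetch_sub_explicit(&s->nr_keys, 1, memory_order_relaxed);
}

static void key_dirtied(struct key_cache_stats *s)
{
	atomic_fetch_add_explicit(&s->nr_dirty, 1, memory_order_relaxed);
}

static void key_cleaned(struct key_cache_stats *s)
{
	atomic_fetch_sub_explicit(&s->nr_dirty, 1, memory_order_relaxed);
}

int main(void)
{
	struct key_cache_stats s = { 0 };

	key_added(&s); key_added(&s); key_dirtied(&s);
	key_cleaned(&s); key_evicted(&s);

	/* Readers only need a plain atomic load, no lock: */
	printf("nr_keys=%ld nr_dirty=%ld\n",
	       atomic_load_explicit(&s.nr_keys, memory_order_relaxed),
	       atomic_load_explicit(&s.nr_dirty, memory_order_relaxed));
	return 0;
}

In the patch the clean/dirty lists go away entirely; the shrinker and the exit path walk the rhashtable instead, so the mutex is left guarding only the freed list.
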
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index 2f8b5521..02715cd2 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -3,8 +3,8 @@
static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
{
- size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 1024 + nr_keys / 2;
return max_t(ssize_t, 0, nr_dirty - max_dirty);
@@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
{
- size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
return nr_dirty > max_dirty &&
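
The two helpers above encode the write-back throttling policy: flushing becomes necessary once nr_dirty exceeds 1024 + nr_keys/2, and writers must wait once it exceeds 4096 + 3*nr_keys/4 (the real must_wait helper chains one further condition that the hunk above truncates). A small standalone version of the same arithmetic with a worked example, using plain longs in place of atomic_long_t:

#include <stdio.h>

/* How many dirty keys are over the "start flushing" watermark. */
static long keys_need_flush(long nr_dirty, long nr_keys)
{
	long max_dirty = 1024 + nr_keys / 2;
	long over = nr_dirty - max_dirty;

	return over > 0 ? over : 0;
}

/* Whether writers should block until reclaim catches up. */
static int key_cache_must_wait(long nr_dirty, long nr_keys)
{
	long max_dirty = 4096 + (nr_keys * 3) / 4;

	return nr_dirty > max_dirty;
}

int main(void)
{
	/* 10000 cached keys, 7000 of them dirty: */
	printf("need_flush = %ld\n", keys_need_flush(7000, 10000)); /* 7000 - 6024 = 976 */
	printf("must_wait  = %d\n",  key_cache_must_wait(7000, 10000)); /* 7000 <= 11596 -> 0 */
	return 0;
}
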
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 5999044a..1941616f 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -216,6 +216,7 @@ enum btree_iter_type {
#define BTREE_ITER_CACHED_NOFILL (1 << 9)
#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
#define BTREE_ITER_NOT_EXTENTS (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -245,6 +246,8 @@ struct btree_iter {
/* what we're searching for/what the iterator actually points to: */
struct bpos real_pos;
struct bpos pos_after_commit;
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
u16 flags;
u8 idx;
@@ -292,13 +295,12 @@ struct btree_key_cache {
struct rhashtable table;
bool table_init_done;
struct list_head freed;
- struct list_head clean;
- struct list_head dirty;
struct shrinker shrink;
+ unsigned shrink_iter;
size_t nr_freed;
- size_t nr_keys;
- size_t nr_dirty;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
};
struct bkey_cached_key {
@@ -330,7 +332,7 @@ struct bkey_cached {
struct btree_insert_entry {
unsigned trigger_flags;
u8 bkey_type;
- u8 btree_id;
+ enum btree_id btree_id:8;
u8 level;
unsigned trans_triggers_run:1;
unsigned is_extent:1;
@@ -344,6 +346,14 @@ struct btree_insert_entry {
#define BTREE_ITER_MAX 32
#endif
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
struct btree_trans {
struct bch_fs *c;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -378,6 +388,7 @@ struct btree_trans {
struct btree_insert_entry *updates2;
/* update path: */
+ struct btree_trans_commit_hook *hooks;
struct jset_entry *extra_journal_entries;
unsigned extra_journal_entry_u64s;
struct journal_entry_pin *journal_pin;
@@ -600,6 +611,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+#define BTREE_ID_HAS_SNAPSHOTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_inodes)| \
+ (1U << BTREE_ID_dirents)| \
+ (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
+
enum btree_trigger_flags {
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index a2513808..4ce12ae2 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_trigger_flags);
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
/**
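
bch2_trans_commit_hook() pushes a hook onto a singly linked list hanging off the transaction, and bch2_trans_commit_write_locked() (next file) walks that list while the relevant nodes are write locked, aborting the commit if any hook returns an error. A self-contained sketch of that intrusive hook-list pattern, with invented names for the transaction and hook types:

#include <stdio.h>

struct trans;			/* stand-in for struct btree_trans */
struct commit_hook;

typedef int (commit_hook_fn)(struct trans *, struct commit_hook *);

struct commit_hook {
	commit_hook_fn		*fn;
	struct commit_hook	*next;
};

struct trans {
	struct commit_hook	*hooks;
	int			journal_seq;
};

/* Registration just pushes onto the head of the list (LIFO order): */
static void trans_commit_hook(struct trans *trans, struct commit_hook *h)
{
	h->next = trans->hooks;
	trans->hooks = h;
}

/* At commit time every hook runs once; any error aborts the commit: */
static int run_commit_hooks(struct trans *trans)
{
	struct commit_hook *h;
	int ret;

	for (h = trans->hooks; h; h = h->next) {
		ret = h->fn(trans, h);
		if (ret)
			return ret;
	}
	return 0;
}

static int log_hook(struct trans *trans, struct commit_hook *h)
{
	printf("hook %p ran at seq %d\n", (void *) h, trans->journal_seq);
	return 0;
}

int main(void)
{
	struct trans trans = { .hooks = NULL, .journal_seq = 7 };
	struct commit_hook a = { .fn = log_hook }, b = { .fn = log_hook };

	trans_commit_hook(&trans, &a);
	trans_commit_hook(&trans, &b);	/* b runs before a */
	return run_commit_hooks(&trans);
}

In real users the hook is typically embedded in a larger allocation and the callback recovers its container with container_of(); the sketch keeps the hooks standalone for brevity.
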
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index a661bc0c..19dfc32e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- if (bkey_cmp(next_node, bp.v->min_key)) {
+ if (bpos_cmp(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
panic("expected next min_key %s got %s\n",
(bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
@@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- if (bkey_cmp(k.k->p, b->key.k.p)) {
+ if (bpos_cmp(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
panic("expected end %s got %s\n",
(bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
@@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
}
- next_node = bkey_successor(k.k->p);
+ next_node = bpos_successor(k.k->p);
}
#endif
}
@@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
struct bset_tree *t;
struct bkey uk;
- bch2_bkey_format_add_pos(s, b->data->min_key);
-
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k)) {
@@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b)
struct bkey_format_state s;
bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
__bch2_btree_calc_format(&s, b);
return bch2_bkey_format_done(&s);
@@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
@@ -1095,10 +1094,12 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct btree *n1,
struct btree_iter *iter)
{
+ struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bset *set1, *set2;
- struct bkey_packed *k, *prev = NULL;
+ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+ struct bpos n1_pos;
n2 = bch2_btree_node_alloc(as, n1->c.level);
bch2_btree_update_add_new_node(as, n2);
@@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as,
SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
- btree_node_set_format(n2, n2->data->format);
-
set1 = btree_bset_first(n1);
set2 = btree_bset_first(n2);
@@ -1119,7 +1118,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
*/
k = set1->start;
while (1) {
- struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+ struct bkey_packed *n = bkey_next(k);
if (n == vstruct_last(set1))
break;
@@ -1136,33 +1135,53 @@ static struct btree *__btree_split_node(struct btree_update *as,
}
BUG_ON(!prev);
+ set2_start = k;
+ set2_end = vstruct_last(set1);
- btree_set_max(n1, bkey_unpack_pos(n1, prev));
- btree_set_min(n2, bkey_successor(n1->key.k.p));
-
- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
+ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
-
- n2->nr.live_u64s = le16_to_cpu(set2->u64s);
- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
n1->nr.live_u64s = le16_to_cpu(set1->u64s);
n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
n1->nr.packed_keys = nr_packed;
n1->nr.unpacked_keys = nr_unpacked;
+ n1_pos = bkey_unpack_pos(n1, prev);
+ if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+ n1_pos.snapshot = U32_MAX;
+
+ btree_set_max(n1, n1_pos);
+ btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+ bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, n2->data->min_key);
+ bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ struct bkey uk = bkey_unpack_key(n1, k);
+ bch2_bkey_format_add_key(&s, &uk);
+ }
+
+ n2->data->format = bch2_bkey_format_done(&s);
+ btree_node_set_format(n2, n2->data->format);
+
+ out = set2->start;
+ memset(&n2->nr, 0, sizeof(n2->nr));
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+ ? &n1->format : &bch2_bkey_format_current, k));
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ btree_keys_account_key_add(&n2->nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+ set_btree_bset_end(n2, n2->set);
+
BUG_ON(!set1->u64s);
BUG_ON(!set2->u64s);
- memcpy_u64s(set2->start,
- vstruct_end(set1),
- le16_to_cpu(set2->u64s));
-
btree_node_reset_sib_u64s(n1);
btree_node_reset_sib_u64s(n2);
@@ -1216,7 +1235,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
i = btree_bset_first(b);
src = dst = i->start;
while (src != vstruct_last(i)) {
- n = bkey_next_skip_noops(src, vstruct_last(i));
+ n = bkey_next(src);
if (!bkey_deleted(src)) {
memmove_u64s_down(dst, src, src->u64s);
dst = bkey_next(dst);
@@ -1563,8 +1582,10 @@ retry:
}
bch2_bkey_format_init(&new_s);
- __bch2_btree_calc_format(&new_s, b);
- __bch2_btree_calc_format(&new_s, m);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
new_f = bch2_bkey_format_done(&new_s);
sib_u64s = btree_node_u64s_with_format(b, &new_f) +
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index d9308bd4..67a2c65b 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
{
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ bpos_cmp(l->k->k.p, r->k->k.p);
}
static inline bool same_leaf_as_prev(struct btree_trans *trans,
@@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
- EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(iter->trans->c, b));
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- BUG_ON(bch2_debug_check_bkeys &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type));
- BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos));
+ if (bch2_debug_check_bkeys) {
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ panic("invalid bkey %s on insert: %s\n", buf, invalid);
+ }
+ }
+ BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
BUG_ON(i->level != i->iter->level);
BUG_ON(i->btree_id != i->iter->btree_id);
}
@@ -369,6 +377,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
+ struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
int ret;
@@ -386,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ return ret;
+ h = h->next;
+ }
+
trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
@@ -556,6 +573,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
+ if (!(trans->flags & BTREE_INSERT_NOUNLOCK))
trans_for_each_update2(trans, i)
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!same_leaf_as_prev(trans, i))
@@ -826,7 +844,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
bool trans_trigger_run;
- unsigned u64s;
+ unsigned u64s, reset_flags = 0;
int ret = 0;
if (!trans->nr_updates)
@@ -940,7 +958,11 @@ out:
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_reset:
- bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+ if (!ret)
+ reset_flags |= TRANS_RESET_NOTRAVERSE;
+ if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+ reset_flags |= TRANS_RESET_NOUNLOCK;
+ bch2_trans_reset(trans, reset_flags);
return ret;
err:
@@ -1053,6 +1075,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
return 0;
}
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
int __bch2_btree_insert(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
{
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index c6d49f44..acf60038 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+ iter = bch2_trans_get_iter(&trans, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek(iter);
while (k.k && !(err = bkey_err(k))) {
@@ -273,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
if (err)
return err;
- if (!i->size || !bkey_cmp(POS_MAX, i->from))
+ if (!i->size || !bpos_cmp(POS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
@@ -289,8 +291,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
* can't easily correctly restart a btree node traversal across
* all nodes, meh
*/
- i->from = bkey_cmp(POS_MAX, b->key.k.p)
- ? bkey_successor(b->key.k.p)
+ i->from = bpos_cmp(POS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
: b->key.k.p;
if (!i->size)
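
Much of this patch swaps bkey_cmp()/bkey_successor() for bpos_cmp()/bpos_successor() where whole btree positions, snapshot field included, must be ordered and stepped. A rough sketch of the distinction, under the assumption that the snapshot-aware comparison orders by inode, then offset, then snapshot, and that the successor increments the lowest field with carry; the names mirror the patch but the implementations are simplified stand-ins.

#include <stdint.h>
#include <stdio.h>

struct bpos { uint64_t inode, offset; uint32_t snapshot; };

static int cmp_u64(uint64_t l, uint64_t r) { return (l > r) - (l < r); }

/* Ignores the snapshot field (key ordering within one snapshot): */
static int bkey_cmp_sketch(struct bpos l, struct bpos r)
{
	int c = cmp_u64(l.inode, r.inode);
	return c ? c : cmp_u64(l.offset, r.offset);
}

/* Total order over full positions, snapshot included: */
static int bpos_cmp_sketch(struct bpos l, struct bpos r)
{
	int c = cmp_u64(l.inode, r.inode);
	if (!c) c = cmp_u64(l.offset, r.offset);
	return c ? c : cmp_u64(l.snapshot, r.snapshot);
}

/* Next position in bpos_cmp order: bump snapshot, carrying upward: */
static struct bpos bpos_successor_sketch(struct bpos p)
{
	if (++p.snapshot)
		return p;
	if (++p.offset)
		return p;
	++p.inode;
	return p;
}

int main(void)
{
	struct bpos a = { 1, 100, 3 }, b = { 1, 100, 7 };

	printf("bkey_cmp: %d, bpos_cmp: %d\n",
	       bkey_cmp_sketch(a, b), bpos_cmp_sketch(a, b)); /* 0, -1 */

	a = bpos_successor_sketch(a);
	printf("successor snapshot: %u\n", a.snapshot); /* 4 */
	return 0;
}
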
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 592dd80c..cf4ce2e7 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
int bch2_dirent_create(struct btree_trans *trans,
u64 dir_inum, const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- int flags)
+ u64 *dir_offset, int flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans,
if (ret)
return ret;
- return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
}
static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
int bch2_dirent_rename(struct btree_trans *trans,
u64 src_dir, struct bch_hash_info *src_hash,
u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum,
- const struct qstr *dst_name, u64 *dst_inum,
+ const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
struct btree_iter *src_iter = NULL, *dst_iter = NULL;
@@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter,
&new_dst->k_i, 0);
- goto out;
+ goto out_set_offset;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
@@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
out:
bch2_trans_iter_put(trans, src_iter);
bch2_trans_iter_put(trans, dst_iter);
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 34769371..e1d8ce37 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_create(struct btree_trans *, u64,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, int);
+ const struct qstr *, u64, u64 *, int);
int bch2_dirent_delete_at(struct btree_trans *,
const struct bch_hash_info *,
@@ -46,8 +46,8 @@ enum bch_rename_mode {
int bch2_dirent_rename(struct btree_trans *,
u64, struct bch_hash_info *,
u64, struct bch_hash_info *,
- const struct qstr *, u64 *,
- const struct qstr *, u64 *,
+ const struct qstr *, u64 *, u64 *,
+ const struct qstr *, u64 *, u64 *,
enum bch_rename_mode);
struct btree_iter *
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1dba7e99..f712f685 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index a7e04082..b07d3955 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
- if (bp.v->min_key.snapshot)
+ if (c->sb.version < bcachefs_metadata_version_snapshot &&
+ bp.v->min_key.snapshot)
return "invalid min_key.snapshot";
return bch2_bkey_ptrs_invalid(c, k);
@@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
btree_node_type_is_extents(btree_id) &&
bkey_cmp(bp.v->min_key, POS_MIN))
bp.v->min_key = write
- ? bkey_predecessor(bp.v->min_key)
- : bkey_successor(bp.v->min_key);
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
}
/* KEY_TYPE_extent: */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index c8069dfb..ccee43a2 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s);
/* Generic extent code: */
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_cmp(k->p, m->p) < 0;
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
+ bkey_start_pos(m)) > 0;
+
+ return (cmp1 << 1) + cmp2;
+}
+
int bch2_cut_front_s(struct bpos, struct bkey_s);
int bch2_cut_back_s(struct bpos, struct bkey_s);
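
bch2_extent_overlap(), moved into this header above, classifies how extent k overlaps extent m by packing two boolean comparisons into a two-bit value: bit 1 is "k ends before m ends", bit 0 is "k starts after m starts". A standalone sketch of the same four-way classification on plain half-open ranges (the real helper compares bpos values via bkey_cmp() and bkey_start_pos()):

#include <stdint.h>
#include <stdio.h>

enum overlap {
	OVERLAP_ALL	= 0,	/* k covers all of m        */
	OVERLAP_BACK	= 1,	/* k covers the back of m   */
	OVERLAP_FRONT	= 2,	/* k covers the front of m  */
	OVERLAP_MIDDLE	= 3,	/* k sits in the middle of m */
};

struct range { uint64_t start, end; };	/* half-open [start, end) */

static enum overlap classify(struct range k, struct range m)
{
	int ends_before  = k.end   < m.end;	/* cmp1 in the patch */
	int starts_after = k.start > m.start;	/* cmp2 in the patch */

	return (ends_before << 1) + starts_after;
}

int main(void)
{
	struct range m = { 10, 20 };

	printf("%d\n", classify((struct range){  0, 30 }, m)); /* 0: ALL    */
	printf("%d\n", classify((struct range){ 15, 30 }, m)); /* 1: BACK   */
	printf("%d\n", classify((struct range){  0, 15 }, m)); /* 2: FRONT  */
	printf("%d\n", classify((struct range){ 12, 18 }, m)); /* 3: MIDDLE */
	return 0;
}

Encoding the result this way lets callers switch on a single value instead of re-deriving the four cases from two comparisons at every call site.
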
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 503ce192..83c2168c 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
{
struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL;
+ struct btree_iter *inode_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
@@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- ret = bch2_inode_create(trans, new_inode);
+ inode_iter = bch2_inode_create(trans, new_inode);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
if (ret)
goto err;
@@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
}
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, inode_iter, new_inode);
err:
+ bch2_trans_iter_put(trans, inode_iter);
bch2_trans_iter_put(trans, dir_iter);
return ret;
}
@@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
@@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
+ inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
ret = PTR_ERR_OR_ZERO(dir_iter);
if (ret)
@@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum, BCH_HASH_SET_MUST_CREATE) ?:
- bch2_inode_write(trans, dir_iter, dir_u) ?:
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
err:
bch2_trans_iter_put(trans, dir_iter);
@@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
*inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(trans->c);
+ u64 inum, now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
@@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
@@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans,
const struct qstr *dst_name,
enum bch_rename_mode mode)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+ u64 src_inode, src_offset, dst_inode, dst_offset;
+ u64 now = bch2_current_time(c);
int ret;
src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
@@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (ret)
goto err;
- src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+ src_hash = bch2_hash_info_init(c, src_dir_u);
if (dst_dir != src_dir) {
dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
@@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (ret)
goto err;
- dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
} else {
dst_dir_u = src_dir_u;
dst_hash = src_hash;
@@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode,
- dst_name, &dst_inode,
+ src_name, &src_inode, &src_offset,
+ dst_name, &dst_inode, &dst_offset,
mode);
if (ret)
goto err;
@@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+ }
+
if (mode == BCH_RENAME_OVERWRITE) {
if (S_ISDIR(src_inode_u->bi_mode) !=
S_ISDIR(dst_inode_u->bi_mode)) {
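
With dirent creation now reporting the offset it landed at, create/link/rename can record an inode backpointer (bi_dir, bi_dir_offset) whenever the superblock version supports it. A simplified sketch of that wiring; the structs, the fake dirent insert, and the version flag are invented stand-ins for the real unpacked inode and superblock check:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the unpacked inode: */
struct inode_u {
	uint64_t bi_inum;
	uint64_t bi_dir;	 /* directory that points at this inode */
	uint64_t bi_dir_offset;	 /* dirent position inside that directory */
};

/*
 * Pretend dirent insert: reports where the dirent landed through
 * *dir_offset, mirroring the new bch2_dirent_create() out-parameter.
 */
static int dirent_create(uint64_t dir_inum, const char *name,
			 uint64_t dst_inum, uint64_t *dir_offset)
{
	(void) dir_inum;
	(void) name;
	*dir_offset = dst_inum * 16 + 1;	/* pretend hash slot */
	return 0;
}

static int create_with_backpointer(int have_backpointers,
				   uint64_t dir_inum, const char *name,
				   struct inode_u *new_inode)
{
	uint64_t dir_offset = 0;
	int ret = dirent_create(dir_inum, name, new_inode->bi_inum, &dir_offset);

	if (ret)
		return ret;

	/* Only newer superblock versions know about backpointers: */
	if (have_backpointers) {
		new_inode->bi_dir	 = dir_inum;
		new_inode->bi_dir_offset = dir_offset;
	}
	return 0;
}

int main(void)
{
	struct inode_u inode = { .bi_inum = 4096 };

	create_with_backpointer(1, 256, "file", &inode);
	printf("backpointer: %llu:%llu\n",
	       (unsigned long long) inode.bi_dir,
	       (unsigned long long) inode.bi_dir_offset);
	return 0;
}
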
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9dc162f2..62788ae1 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -675,6 +675,39 @@ retry:
continue;
}
+ if (!target.bi_nlink &&
+ !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (target.bi_dir != k.k->p.inode ||
+ target.bi_dir_offset != k.k->p.offset) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode %llu has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ d_inum,
+ target.bi_dir,
+ target.bi_dir_offset,
+ k.k->p.inode,
+ k.k->p.offset) ||
+ c->opts.version_upgrade)) {
+ struct bkey_inode_buf p;
+
+ target.bi_dir = k.k->p.inode;
+ target.bi_dir_offset = k.k->p.offset;
+ bch2_trans_unlock(&trans);
+
+ bch2_inode_pack(c, &p, &target);
+
+ ret = bch2_btree_insert(c, BTREE_ID_inodes,
+ &p.inode.k_i, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret) {
+ bch_err(c, "error in fsck: error %i updating inode", ret);
+ goto err;
+ }
+ continue;
+ }
+
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(target.bi_mode), c,
@@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ if (!S_ISDIR(u.bi_mode) &&
+ u.bi_nlink &&
+ !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+ c->opts.version_upgrade)) {
+ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+ do_update = true;
+ }
+
if (do_update) {
struct bkey_inode_buf p;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 4559e77f..f1665ca8 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(trans->c, inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
@@ -469,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-int bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u)
{
struct bch_fs *c = trans->c;
- struct bkey_inode_buf *inode_p;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 min, max, start, *hint;
@@ -493,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans,
if (start >= max || start < min)
start = min;
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
again:
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
@@ -520,7 +516,7 @@ again:
bch2_trans_iter_put(trans, iter);
if (ret)
- return ret;
+ return ERR_PTR(ret);
if (start != min) {
/* Retry from start */
@@ -528,15 +524,12 @@ again:
goto again;
}
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
found_slot:
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
-
- ret = bch2_inode_write(trans, iter, inode_u);
- bch2_trans_iter_put(trans, iter);
- return ret;
+ return iter;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 1caf036a..6bad6dfb 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+ struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, u64, bool);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 284d398b..36b10cb7 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta || new_i_size) {
bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+ inode_p.inode.k.p.snapshot = iter->snapshot;
+
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
}
@@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op)
k = bch2_keylist_front(keys);
+ k->k.p.snapshot = iter->snapshot;
+
bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter->pos, sk.k);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1f26139d..69c553a6 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
+
/*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
* is smaller:
*/
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ nr = clamp_t(unsigned, nr,
BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
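
The journal sizing change above starts from 1/128th of the device (nbuckets >> 7) and clamps the result to at most 8192 buckets or 8GiB worth of sectors, whichever is smaller (1 << 24 sectors of 512 bytes = 8GiB). A small standalone version of that calculation with a worked example; BCH_JOURNAL_BUCKETS_MIN is assumed to be 8 here, the real constant lives in the bcachefs headers.

#include <stdio.h>

#define JOURNAL_BUCKETS_MIN	8U	/* assumed value for this sketch */

static unsigned clamp_u(unsigned v, unsigned lo, unsigned hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

/* bucket_size is in 512-byte sectors, as in the member info. */
static unsigned journal_buckets(unsigned long long nbuckets,
				unsigned bucket_size)
{
	unsigned nr = nbuckets >> 7;	/* 1/128th of the device */

	return clamp_u(nr, JOURNAL_BUCKETS_MIN,
		       min_u(1U << 13,			 /* 8192 buckets    */
			     (1U << 24) / bucket_size)); /* 8GiB in sectors */
}

int main(void)
{
	/* 1TiB device with 512KiB buckets: 2097152 buckets of 1024 sectors */
	unsigned nr = journal_buckets(2097152, 1024);

	/* 2097152 >> 7 = 16384, capped at min(8192, 16384) = 8192 */
	printf("journal buckets: %u\n", nr);
	return 0;
}
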
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 54f2e205..c7fa03cf 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl)
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
if (validate_before_checksum &&
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index bbf8e5ad..4a5b50ed 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
- c->btree_key_cache.nr_dirty,
- c->btree_key_cache.nr_keys);
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index f863fd74..3d1bf87e 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id l_btree_id,
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
- bkey_cmp(l_pos, r->k->k.p));
+ bpos_cmp(l_pos, r->k->k.p));
}
static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
{
return (cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p));
+ bpos_cmp(l->k->k.p, r->k->k.p));
}
static size_t journal_key_search(struct journal_keys *journal_keys,
@@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
if (iter->idx > idx ||
(iter->idx == idx &&
biter->last &&
- bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+ bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
iter->idx++;
}
@@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
if (btree_k.k && journal_k.k) {
- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+ int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
if (!cmp)
bch2_journal_iter_advance_btree(iter);
@@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
ret = iter->last == journal ? journal_k : btree_k;
if (iter->b &&
- bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+ bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
iter->journal.idx = iter->journal.keys->nr;
iter->last = none;
return bkey_s_c_null;
@@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
return cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
- !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
*dst++ = *src++;
@@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ bpos_cmp(l->k->k.p, r->k->k.p);
}
static int bch2_journal_replay(struct bch_fs *c,
@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+ ret = -EINVAL;
+ goto err;
+
+ }
+
if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
bch_info(c, "alloc_v2 feature bit not set, fsck required");
c->opts.fsck = true;
@@ -1338,6 +1345,7 @@ int bch2_fs_initialize(struct bch_fs *c)
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(c, &packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_inodes,
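
The journal-key comparators above chain cmp_int() and bpos_cmp() with the GNU a ?: b extension: each comparison only falls through to the next when the previous one returned 0, giving a lexicographic order over (btree_id, level, position, ...). A minimal sketch of the same idiom; cmp_int here is a local macro approximating the one in the bcachefs utility headers, and the key layout is invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Three-way compare that never overflows: */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct journal_key_sketch {
	uint32_t btree_id;
	uint32_t level;
	uint64_t pos;		/* stands in for a full struct bpos */
	uint64_t journal_seq;
};

static int journal_key_cmp_sketch(const void *_l, const void *_r)
{
	const struct journal_key_sketch *l = _l, *r = _r;

	/* GNU a ?: b evaluates to a unless a == 0, so later comparisons
	 * only break ties left by earlier ones: */
	return  cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->level, r->level) ?:
		cmp_int(l->pos, r->pos) ?:
		cmp_int(l->journal_seq, r->journal_seq);
}

int main(void)
{
	struct journal_key_sketch keys[] = {
		{ 1, 0, 100, 9 },
		{ 0, 0, 500, 2 },
		{ 0, 0, 100, 7 },
	};

	qsort(keys, 3, sizeof(keys[0]), journal_key_cmp_sketch);

	for (unsigned i = 0; i < 3; i++)
		printf("btree %u pos %llu seq %llu\n", keys[i].btree_id,
		       (unsigned long long) keys[i].pos,
		       (unsigned long long) keys[i].journal_seq);
	return 0;
}
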
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index dfb12fdd..7507b6bc 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
goto err;
}
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
goto err;
}
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -113,7 +115,7 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
BUG_ON(i);
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -166,7 +169,7 @@ err:
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
BUG_ON(i);
err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
BUG_ON(k.k->p.offset != i);
i += 2;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr * 2);
@@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
if (i == nr * 2)
break;
}
+ bch2_trans_iter_put(&trans, iter);
err:
bch2_trans_exit(&trans);
return ret;
@@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
BUG_ON(k.k->size != 8);
i += 16;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr);
@@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
if (i == nr)
break;
}
+ bch2_trans_iter_put(&trans, iter);
err:
bch2_trans_exit(&trans);
return 0;
@@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return 0;
}
@@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return 0;
}
@@ -473,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr)
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
@@ -508,7 +519,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
}
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -549,7 +560,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
}
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -630,6 +641,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
if (++i == nr)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
@@ -645,6 +658,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
;
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
@@ -671,6 +686,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
break;
}
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
diff --git a/linux/rhashtable.c b/linux/rhashtable.c
index 351eac79..ba2196fc 100644
--- a/linux/rhashtable.c
+++ b/linux/rhashtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Resizable, Scalable, Concurrent Hash Table
*
@@ -8,27 +9,29 @@
* Code partially derived from nft_hash
* Rewritten with rehash code from br_multicast plus single list
* pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/atomic.h>
-#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
+#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/jhash.h>
+#include <linux/overflow.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
+#include <linux/export.h>
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4U
-#define BUCKET_LOCKS_PER_CPU 32UL
+
+union nested_table {
+ union nested_table __rcu *table;
+ struct rhash_lock_head __rcu *bucket;
+};
static u32 head_hashfn(struct rhashtable *ht,
const struct bucket_table *tbl,
@@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht,
return rht_head_hashfn(ht, tbl, he, ht->p);
}
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
- gfp_t gfp)
-{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
- nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
- size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
- /* Never allocate more than 0.5 locks per bucket */
- size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+ if (!debug_locks)
+ return 1;
+ if (unlikely(tbl->nest))
+ return 1;
+ return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
- if (sizeof(spinlock_t) != 0) {
- tbl->locks = NULL;
- if (gfp != GFP_KERNEL)
- gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+ const struct bucket_table *tbl)
+{
+ /* The top-level bucket entry does not need RCU protection
+ * because it's set at the same time as tbl->nest.
+ */
+ return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
- if (!tbl->locks)
- tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
- gfp);
- if (!tbl->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ const unsigned int len = 1 << shift;
+ unsigned int i;
+
+ ntbl = rcu_dereference_protected(ntbl->table, 1);
+ if (!ntbl)
+ return;
+
+ if (size > len) {
+ size >>= shift;
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
}
- tbl->locks_mask = size - 1;
- return 0;
+ kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int len = 1 << tbl->nest;
+ union nested_table *ntbl;
+ unsigned int i;
+
+ ntbl = nested_table_top(tbl);
+
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
+
+ kfree(ntbl);
}
static void bucket_table_free(struct bucket_table *tbl)
{
- if (tbl)
- kvfree(tbl->locks);
+ if (tbl->nest)
+ nested_bucket_table_free(tbl);
kvfree(tbl);
}
@@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
bucket_table_free(container_of(head, struct bucket_table, rcu));
}
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+ union nested_table __rcu **prev,
+ bool leaf)
+{
+ union nested_table *ntbl;
+ int i;
+
+ ntbl = rcu_dereference(*prev);
+ if (ntbl)
+ return ntbl;
+
+ ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+ if (ntbl && leaf) {
+ for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+ INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+ }
+
+ if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+ return ntbl;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets,
+ gfp_t gfp)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ struct bucket_table *tbl;
+ size_t size;
+
+ if (nbuckets < (1 << (shift + 1)))
+ return NULL;
+
+ size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+ tbl = kzalloc(size, gfp);
+ if (!tbl)
+ return NULL;
+
+ if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+ false)) {
+ kfree(tbl);
+ return NULL;
+ }
+
+ tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+ return tbl;
+}
+
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t nbuckets,
gfp_t gfp)
@@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t size;
int i;
- size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
- gfp != GFP_KERNEL)
- tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- if (tbl == NULL && gfp == GFP_KERNEL)
- tbl = vzalloc(size);
- if (tbl == NULL)
- return NULL;
+ tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
- tbl->size = nbuckets;
+ size = nbuckets;
- if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
- bucket_table_free(tbl);
- return NULL;
+ if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+ tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+ nbuckets = 0;
}
+ if (tbl == NULL)
+ return NULL;
+
+ tbl->size = size;
+
+ rcu_head_init(&tbl->rcu);
INIT_LIST_HEAD(&tbl->walkers);
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ tbl->hash_rnd = get_random_u32();
for (i = 0; i < nbuckets; i++)
- INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+ INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
return tbl;
}
@@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
return new_tbl;
}
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- struct bucket_table *new_tbl = rhashtable_last_table(ht,
- rht_dereference_rcu(old_tbl->future_tbl, ht));
- struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
- int err = -ENOENT;
+ struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+ int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
- spinlock_t *new_bucket_lock;
+ struct rhash_head __rcu **pprev = NULL;
unsigned int new_hash;
- rht_for_each(entry, old_tbl, old_hash) {
+ if (new_tbl->nest)
+ goto out;
+
+ err = -ENOENT;
+
+ rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+ old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
new_hash = head_hashfn(ht, new_tbl, entry);
- new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+ rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
- spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
- head = rht_dereference_bucket(new_tbl->buckets[new_hash],
- new_tbl, new_hash);
+ head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
RCU_INIT_POINTER(entry->next, head);
- rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
- spin_unlock(new_bucket_lock);
+ rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
- rcu_assign_pointer(*pprev, next);
+ if (pprev)
+ rcu_assign_pointer(*pprev, next);
+ else
+ /* Need to preserved the bit lock. */
+ rht_assign_locked(bkt, next);
out:
return err;
}
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- spinlock_t *old_bucket_lock;
+ struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+ int err;
- old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+ if (!bkt)
+ return 0;
+ rht_lock(old_tbl, bkt);
- spin_lock_bh(old_bucket_lock);
- while (!rhashtable_rehash_one(ht, old_hash))
+ while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
;
- old_tbl->rehash++;
- spin_unlock_bh(old_bucket_lock);
+
+ if (err == -ENOENT)
+ err = 0;
+ rht_unlock(old_tbl, bkt);
+
+ return err;
}
static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
- /* Protect future_tbl using the first bucket lock. */
- spin_lock_bh(old_tbl->locks);
-
- /* Did somebody beat us to it? */
- if (rcu_access_pointer(old_tbl->future_tbl)) {
- spin_unlock_bh(old_tbl->locks);
- return -EEXIST;
- }
-
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
+ * As cmpxchg() provides strong barriers, we do not need
+ * rcu_assign_pointer().
*/
- rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
- spin_unlock_bh(old_tbl->locks);
+ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+ new_tbl) != NULL)
+ return -EEXIST;
return 0;
}
@@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
struct bucket_table *new_tbl;
struct rhashtable_walker *walker;
unsigned int old_hash;
+ int err;
new_tbl = rht_dereference(old_tbl->future_tbl, ht);
if (!new_tbl)
return 0;
- for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
- rhashtable_rehash_chain(ht, old_hash);
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+ err = rhashtable_rehash_chain(ht, old_hash);
+ if (err)
+ return err;
+ cond_resched();
+ }
/* Publish the new table pointer. */
rcu_assign_pointer(ht->tbl, new_tbl);
@@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
- spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
+ * We do this inside the locked region so that
+ * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+ * to detect when it must not re-link the table.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+ spin_unlock(&ht->lock);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+ struct bucket_table *old_tbl,
+ unsigned int size)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl;
int err;
- old_tbl = rhashtable_last_table(ht, old_tbl);
+ ASSERT_RHT_MUTEX(ht);
- new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
@@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht)
return err;
}
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht: the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., to the smallest
+ * size that would not cause it to expand again right away.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
static int rhashtable_shrink(struct rhashtable *ht)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
unsigned int nelems = atomic_read(&ht->nelems);
unsigned int size = 0;
- int err;
if (nelems)
size = roundup_pow_of_two(nelems * 3 / 2);
@@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
if (rht_dereference(old_tbl->future_tbl, ht))
return -EEXIST;
- new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (new_tbl == NULL)
- return -ENOMEM;
-
- err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
- if (err)
- bucket_table_free(new_tbl);
-
- return err;
+ return rhashtable_rehash_alloc(ht, old_tbl, size);
}
static void rht_deferred_worker(struct work_struct *work)
@@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work)
tbl = rhashtable_last_table(ht, tbl);
if (rht_grow_above_75(ht, tbl))
- rhashtable_expand(ht);
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
- rhashtable_shrink(ht);
+ err = rhashtable_shrink(ht);
+ else if (tbl->nest)
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+ if (!err || err == -EEXIST) {
+ int nerr;
- err = rhashtable_rehash_table(ht);
+ nerr = rhashtable_rehash_table(ht);
+ err = err ?: nerr;
+ }
mutex_unlock(&ht->mutex);
@@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work)
schedule_work(&ht->run_work);
}
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
- struct bucket_table *tbl,
- unsigned int hash)
-{
- unsigned int elasticity = ht->elasticity;
- struct rhash_head *head;
-
- rht_for_each(head, tbl, hash)
- if (!--elasticity)
- return true;
-
- return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
- struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
@@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
err = -ENOMEM;
- new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
if (new_tbl == NULL)
goto fail;
@@ -363,7 +470,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
fail:
/* Do not fail the insert if someone else did a rehash. */
- if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ if (likely(rcu_access_pointer(tbl->future_tbl)))
return 0;
/* Schedule async rehash to retry allocation in process context. */
@@ -373,57 +480,485 @@ fail:
return err;
}
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
- const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash,
+ const void *key, struct rhash_head *obj)
{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct rhash_head __rcu **pprev = NULL;
struct rhash_head *head;
- unsigned int hash;
- int err;
+ int elasticity;
+
+ elasticity = RHT_ELASTICITY;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+ struct rhlist_head *plist;
+
+ elasticity--;
+ if (!key ||
+ (ht->p.obj_cmpfn ?
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- tbl = rhashtable_last_table(ht, tbl);
- hash = head_hashfn(ht, tbl, obj);
- spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+ if (!ht->rhlist)
+ return rht_obj(ht, head);
- err = -EEXIST;
- if (key && rhashtable_lookup_fast(ht, key, ht->p))
- goto exit;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto exit;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev)
+ rcu_assign_pointer(*pprev, obj);
+ else
+ /* Need to preserve the bit lock */
+ rht_assign_locked(bkt, obj);
+
+ return NULL;
+ }
+
+ if (elasticity <= 0)
+ return ERR_PTR(-EAGAIN);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+ struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+ void *data)
+{
+ struct bucket_table *new_tbl;
+ struct rhash_head *head;
+
+ if (!IS_ERR_OR_NULL(data))
+ return ERR_PTR(-EEXIST);
+
+ if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
+
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (new_tbl)
+ return new_tbl;
+
+ if (PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
- err = -EAGAIN;
- if (rhashtable_check_elasticity(ht, tbl, hash) ||
- rht_grow_above_100(ht, tbl))
- goto exit;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ return ERR_PTR(-E2BIG);
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ return ERR_PTR(-EAGAIN);
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (ht->rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
+
+ /* bkt is always the head of the list, so it holds
+ * the lock, which we need to preserve
+ */
+ rht_assign_locked(bkt, obj);
atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
+
+ return NULL;
+}
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ struct bucket_table *new_tbl;
+ struct bucket_table *tbl;
+ struct rhash_lock_head __rcu **bkt;
+ unsigned int hash;
+ void *data;
+
+ new_tbl = rcu_dereference(ht->tbl);
+
+ do {
+ tbl = new_tbl;
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ if (rcu_access_pointer(tbl->future_tbl))
+ /* Failure is OK */
+ bkt = rht_bucket_var(tbl, hash);
+ else
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ if (bkt == NULL) {
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ data = ERR_PTR(-EAGAIN);
+ } else {
+ rht_lock(tbl, bkt);
+ data = rhashtable_lookup_one(ht, bkt, tbl,
+ hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+ hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ rht_unlock(tbl, bkt);
+ }
+ } while (!IS_ERR_OR_NULL(new_tbl));
+
+ if (PTR_ERR(data) == -EAGAIN)
+ data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+ -EAGAIN);
+
+ return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ void *data;
+
+ do {
+ rcu_read_lock();
+ data = rhashtable_try_insert(ht, key, obj);
+ rcu_read_unlock();
+ } while (PTR_ERR(data) == -EAGAIN);
-exit:
- spin_unlock(rht_bucket_lock(tbl, hash));
+ return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
- if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+ iter->ht = ht;
+ iter->p = NULL;
+ iter->slot = 0;
+ iter->skip = 0;
+ iter->end_of_table = 0;
+
+ spin_lock(&ht->lock);
+ iter->walker.tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+ list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+ spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter: Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+ spin_lock(&iter->ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position. Note that we take
+ * the RCU lock in all cases including when we return an error. So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+ __acquires(RCU)
+{
+ struct rhashtable *ht = iter->ht;
+ bool rhlist = ht->rhlist;
+
+ rcu_read_lock();
+
+ spin_lock(&ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&ht->lock);
+
+ if (iter->end_of_table)
+ return 0;
+ if (!iter->walker.tbl) {
+ iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+ iter->slot = 0;
+ iter->skip = 0;
+ return -EAGAIN;
+ }
+
+ if (iter->p && !rhlist) {
+ /*
+ * We need to validate that 'p' is still in the table, and
+ * if so, update 'skip'
+ */
+ struct rhash_head *p;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ skip++;
+ if (p == iter->p) {
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ iter->p = NULL;
+ } else if (iter->p && rhlist) {
+ /* Need to validate that 'list' is still in the table, and
+ * if so, update 'skip' and 'p'.
+ */
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ for (list = container_of(p, struct rhlist_head, rhead);
+ list;
+ list = rcu_dereference(list->next)) {
+ skip++;
+ if (list == iter->list) {
+ iter->p = p;
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ }
+ iter->p = NULL;
+ }
+found:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter: Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+ struct bucket_table *tbl = iter->walker.tbl;
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (!tbl)
return NULL;
- else if (err == -EAGAIN)
- return tbl;
+
+ for (; iter->slot < tbl->size; iter->slot++) {
+ int skip = iter->skip;
+
+ rht_for_each_rcu(p, tbl, iter->slot) {
+ if (rhlist) {
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ do {
+ if (!skip)
+ goto next;
+ skip--;
+ list = rcu_dereference(list->next);
+ } while (list);
+
+ continue;
+ }
+ if (!skip)
+ break;
+ skip--;
+ }
+
+next:
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ iter->skip = 0;
+ }
+
+ iter->p = NULL;
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker.tbl) {
+ iter->slot = 0;
+ iter->skip = 0;
+ return ERR_PTR(-EAGAIN);
+ } else {
+ iter->end_of_table = true;
+ }
+
+ return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (p) {
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head, rhead);
+ }
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ /* At the end of this slot, switch to next one and then find
+ * next entry from that point.
+ */
+ iter->skip = 0;
+ iter->slot++;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (p)
+ return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+ /* No object found in current iter, find next one in the table. */
+
+ if (iter->skip) {
+ /* A nonzero skip value points to the next entry in the table
+ * beyond the last one that was found. Decrement skip so
+ * we find the current value. __rhashtable_walk_find_next
+ * will restore the original value of skip assuming that
+ * the table hasn't changed.
+ */
+ iter->skip--;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Finish a hash table walk. Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+ __releases(RCU)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl = iter->walker.tbl;
+
+ if (!tbl)
+ goto out;
+
+ ht = iter->ht;
+
+ spin_lock(&ht->lock);
+ if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+ /* This bucket table is being freed, don't re-link it. */
+ iter->walker.tbl = NULL;
else
- return ERR_PTR(err);
+ list_add(&iter->walker.list, &tbl->walkers);
+ spin_unlock(&ht->lock);
+
+out:
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
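Editor's note: as a usage sketch only (not part of this patch), the walker API above is typically combined as below. Here ht is assumed to be an already-initialized struct rhashtable holding struct test_obj entries (the hypothetical structure from the rhashtable_init() kernel-doc further down); an -EAGAIN from rhashtable_walk_next() just means a resize rewound the iterator, so the loop may continue.

	struct rhashtable_iter iter;
	struct test_obj *obj;

	rhashtable_walk_enter(&ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)
				continue;	/* table resized, iterator rewound */
			break;
		}
		/* ... use obj; duplicates may be seen across stop/start ... */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);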
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
- return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
- (unsigned long)params->min_size);
+ size_t retsize;
+
+ if (params->nelem_hint)
+ retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+ (unsigned long)params->min_size);
+ else
+ retsize = max(HASH_DEFAULT_SIZE,
+ (unsigned long)params->min_size);
+
+ return retsize;
}
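As a worked example (illustrative numbers, not from the patch): with nelem_hint = 100 and min_size = 16 the initial table gets max(roundup_pow_of_two(100 * 4 / 3), 16) = roundup_pow_of_two(133) = 256 buckets, while with no hint it simply gets max(HASH_DEFAULT_SIZE, min_size).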
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
@@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
return jhash2(key, length, seed);
}
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht: hash table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ * int key;
+ * void * my_member;
+ * struct rhash_head node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .key_offset = offsetof(struct test_obj, key),
+ * .key_len = sizeof(int),
+ * .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ * [...]
+ * struct rhash_head node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ * struct test_obj *obj = data;
+ *
+ * return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .hashfn = jhash,
+ * .obj_hashfn = my_hash_fn,
+ * };
+ */
int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params)
{
struct bucket_table *tbl;
size_t size;
- size = HASH_DEFAULT_SIZE;
-
if ((!params->key_len && !params->obj_hashfn) ||
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
- if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
- return -EINVAL;
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
@@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht,
if (params->min_size)
ht->p.min_size = roundup_pow_of_two(params->min_size);
- if (params->max_size)
- ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ /* Cap total entries at 2^31 to avoid nelems overflow. */
+ ht->max_elems = 1u << 31;
- if (params->insecure_max_entries)
- ht->p.insecure_max_entries =
- rounddown_pow_of_two(params->insecure_max_entries);
- else
- ht->p.insecure_max_entries = ht->p.max_size * 2;
-
- ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+ if (params->max_size) {
+ ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ if (ht->p.max_size < ht->max_elems / 2)
+ ht->max_elems = ht->p.max_size * 2;
+ }
- if (params->nelem_hint)
- size = rounded_hashtable_size(&ht->p);
-
- /* The maximum (not average) chain length grows with the
- * size of the hash table, at a rate of (log N)/(log log N).
- * The value of 16 is selected so that even if the hash
- * table grew to 2^32 you would not expect the maximum
- * chain length to exceed it unless we are under attack
- * (or extremely unlucky).
- *
- * As this limit is only to detect attacks, we don't need
- * to set it to a lower value as you'd need the chain
- * length to vastly exceed 16 to have any real effect
- * on the system.
- */
- if (!params->insecure_elasticity)
- ht->elasticity = 16;
+ ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
- if (params->locks_mul)
- ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
- else
- ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+ size = rounded_hashtable_size(&ht->p);
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
@@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht,
}
}
+ /*
+ * This is API initialization and thus we need to guarantee the
+ * initial rhashtable allocation. Upon failure, retry at the
+ * smallest possible size using __GFP_NOFAIL semantics.
+ */
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (tbl == NULL)
- return -ENOMEM;
+ if (unlikely(tbl == NULL)) {
+ size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+ }
atomic_set(&ht->nelems, 0);
@@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht,
return 0;
}
+EXPORT_SYMBOL_GPL(rhashtable_init);
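Editor's note: to tie the kernel-doc example above to the fast-path helpers declared in include/linux/rhashtable.h, here is a hedged end-to-end sketch (not part of this patch; test_obj is the struct from Configuration Example 1, and test_obj_params / test_obj_example are made-up names).

	static const struct rhashtable_params test_obj_params = {
		.head_offset	= offsetof(struct test_obj, node),
		.key_offset	= offsetof(struct test_obj, key),
		.key_len	= sizeof(int),
		.automatic_shrinking = true,
	};

	static int test_obj_example(void)
	{
		struct rhashtable ht;
		struct test_obj *obj;
		int key = 1;
		int err;

		err = rhashtable_init(&ht, &test_obj_params);
		if (err)
			return err;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			err = -ENOMEM;
			goto out;
		}
		obj->key = key;

		/* Fast path; falls back to rhashtable_insert_slow() internally. */
		err = rhashtable_insert_fast(&ht, &obj->node, test_obj_params);
		if (err) {
			kfree(obj);
			goto out;
		}

		/* Lookups are lock-free under rcu_read_lock(). */
		rcu_read_lock();
		obj = rhashtable_lookup_fast(&ht, &key, test_obj_params);
		rcu_read_unlock();

		if (obj && !rhashtable_remove_fast(&ht, &obj->node, test_obj_params))
			kfree(obj);	/* single-threaded here; use kfree_rcu() with concurrent readers */
	out:
		rhashtable_destroy(&ht);
		return err;
	}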
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt: hash list table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
- struct bucket_table *tbl;
+ int err;
+
+ err = rhashtable_init(&hlt->ht, params);
+ hlt->ht.rhlist = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
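Editor's note: a minimal rhlist sketch, assuming the rhltable_insert(), rhltable_remove() and rhltable_destroy() helpers from include/linux/rhashtable.h (dup_obj, dup_params and dup_obj_example are made-up names, not from this patch). The rhlist variant allows several objects with the same key to hang off one bucket entry.

	struct dup_obj {
		int			key;
		struct rhlist_head	list;	/* rhlist_head, not rhash_head */
	};

	static const struct rhashtable_params dup_params = {
		.head_offset	= offsetof(struct dup_obj, list),
		.key_offset	= offsetof(struct dup_obj, key),
		.key_len	= sizeof(int),
	};

	static int dup_obj_example(void)
	{
		struct rhltable hlt;
		struct dup_obj *obj;
		int err;

		err = rhltable_init(&hlt, &dup_params);
		if (err)
			return err;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			err = -ENOMEM;
			goto out;
		}
		obj->key = 1;

		/* A second insert with key == 1 would be chained, not rejected with -EEXIST. */
		err = rhltable_insert(&hlt, &obj->list, dup_params);
		if (err) {
			kfree(obj);
			goto out;
		}

		if (!rhltable_remove(&hlt, &obj->list, dup_params))
			kfree(obj);	/* single-threaded here; use kfree_rcu() with concurrent readers */
	out:
		rhltable_destroy(&hlt);
		return err;
	}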
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct rhlist_head *list;
+
+ if (!ht->rhlist) {
+ free_fn(rht_obj(ht, obj), arg);
+ return;
+ }
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ do {
+ obj = &list->rhead;
+ list = rht_dereference(list->next, ht);
+ free_fn(rht_obj(ht, obj), arg);
+ } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops any pending async resize. If defined, invokes free_fn for each
+ * element to release its resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller must ensure that no further write operations
+ * occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct bucket_table *tbl, *next_tbl;
+ unsigned int i;
cancel_work_sync(&ht->run_work);
mutex_lock(&ht->mutex);
tbl = rht_dereference(ht->tbl, ht);
+restart:
+ if (free_fn) {
+ for (i = 0; i < tbl->size; i++) {
+ struct rhash_head *pos, *next;
+
+ cond_resched();
+ for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL;
+ !rht_is_a_nulls(pos);
+ pos = next,
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL)
+ rhashtable_free_one(ht, pos, free_fn, arg);
+ }
+ }
+
+ next_tbl = rht_dereference(tbl->future_tbl, ht);
bucket_table_free(tbl);
+ if (next_tbl) {
+ tbl = next_tbl;
+ goto restart;
+ }
mutex_unlock(&ht->mutex);
}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
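Editor's note: for example (a sketch, not part of this patch), a caller whose elements were kmalloc'd can hand kfree in via the callback; free_test_obj and ht are assumed names.

	/* Assumed caller-side helper; ptr is the element, arg is unused here. */
	static void free_test_obj(void *ptr, void *arg)
	{
		kfree(ptr);
	}

	/* On teardown, once all writers have stopped: */
	rhashtable_free_and_destroy(&ht, free_test_obj, NULL);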
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+ return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int subhash = hash;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+ subhash >>= tbl->nest;
+
+ while (ntbl && size > (1 << shift)) {
+ index = subhash & ((1 << shift) - 1);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+ tbl, hash);
+ size >>= shift;
+ subhash >>= shift;
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ static struct rhash_lock_head __rcu *rhnull;
+
+ if (!rhnull)
+ INIT_RHT_NULLS_HEAD(rhnull);
+ return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ hash >>= tbl->nest;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+
+ while (ntbl && size > (1 << shift)) {
+ index = hash & ((1 << shift) - 1);
+ size >>= shift;
+ hash >>= shift;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/linux/six.c b/linux/six.c
index 53280044..fe721891 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/six.h>
+#include <linux/slab.h>
#ifdef DEBUG
#define EBUG_ON(cond) BUG_ON(cond)
@@ -309,6 +310,9 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
wake_up_process(p);
}
+ if (ret)
+ six_acquire(&lock->dep_map, 1);
+
return ret;
}
@@ -560,6 +564,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
lock->readers) {
smp_mb(); /* unlock barrier */
this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
state.v = READ_ONCE(lock->state.v);
} else {
EBUG_ON(!(lock->state.v & l[type].held_mask));
@@ -705,6 +710,34 @@ void six_lock_wakeup_all(struct six_lock *lock)
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+struct free_pcpu_rcu {
+ struct rcu_head rcu;
+ void __percpu *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+ struct free_pcpu_rcu *rcu =
+ container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+ free_percpu(rcu->p);
+ kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+ if (!rcu)
+ return;
+
+ rcu->p = lock->readers;
+ lock->readers = NULL;
+
+ call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
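Editor's note: a brief usage sketch (not part of this patch), assuming six_lock_init() from include/linux/six.h together with the six_lock_pcpu_alloc() shown just below.

	struct six_lock lock;

	six_lock_init(&lock);
	six_lock_pcpu_alloc(&lock);	/* switch readers to per-cpu counters */
	/* ... lock is used via the six_lock_read()/six_unlock_read() family ... */
	six_lock_pcpu_free_rcu(&lock);	/* revert; old counters freed after an RCU grace period */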
+
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -717,8 +750,6 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
void six_lock_pcpu_alloc(struct six_lock *lock)
{
- BUG_ON(lock->readers && pcpu_read_count(lock));
- BUG_ON(lock->state.read_lock);
#ifdef __KERNEL__
if (!lock->readers)
lock->readers = alloc_percpu(unsigned);