diff options
33 files changed, 829 insertions, 503 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision index f08feb4a..f52da942 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -9b4ab159abcd84cf0c25ee851dda8c40baffecc8 +5a0455ae19afb354634b3c5c9bf55d2171005a2f diff --git a/include/linux/overflow.h b/include/linux/overflow.h index ba30f77e..0c7e3dcf 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -4,14 +4,12 @@ #include <linux/compiler.h> #include <linux/limits.h> +#include <linux/const.h> /* - * In the fallback code below, we need to compute the minimum and - * maximum values representable in a given type. These macros may also - * be useful elsewhere, so we provide them outside the - * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. - * - * It would seem more obvious to do something like + * We need to compute the minimum and maximum values representable in a given + * type. These macros may also be useful elsewhere. It would seem more obvious + * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) @@ -33,8 +31,10 @@ * credit to Christian Biere. */ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) -#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) -#define type_min(T) ((T)((T)-type_max(T)-(T)1)) +#define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_max(t) __type_max(typeof(t)) +#define __type_min(T) ((T)((T)-type_max(T)-(T)1)) +#define type_min(t) __type_min(typeof(t)) /* * Avoids triggering -Wtype-limits compilation warning, @@ -53,194 +53,153 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW -/* - * For simplicity and code hygiene, the fallback code below insists on - * a, b and *d having the same type (similar to the min() and max() - * macros), whereas gcc's type-generic overflow checkers accept - * different types. Hence we don't just make check_add_overflow an - * alias for __builtin_add_overflow, but add type checks similar to - * below. +/** + * check_add_overflow() - Calculate addition with overflow checking + * @a: first addend + * @b: second addend + * @d: pointer to store sum + * + * Returns true on wrap-around, false otherwise. + * + * *@d holds the results of the attempted addition, regardless of whether + * wrap-around occurred. */ -#define check_add_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_add_overflow(__a, __b, __d); \ -})) - -#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_sub_overflow(__a, __b, __d); \ -})) - -#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_mul_overflow(__a, __b, __d); \ -})) - -#else - +#define check_add_overflow(a, b, d) \ + __must_check_overflow(__builtin_add_overflow(a, b, d)) -/* Checking for unsigned overflow is relatively easy without causing UB. */ -#define __unsigned_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a + __b; \ - *__d < __a; \ -}) -#define __unsigned_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a - __b; \ - __a < __b; \ -}) -/* - * If one of a or b is a compile-time constant, this avoids a division. +/** + * wrapping_add() - Intentionally perform a wrapping addition + * @type: type for result of calculation + * @a: first addend + * @b: second addend + * + * Return the potentially wrapped-around addition without + * tripping any wrap-around sanitizers that may be enabled. */ -#define __unsigned_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a * __b; \ - __builtin_constant_p(__b) ? \ - __b > 0 && __a > type_max(typeof(__a)) / __b : \ - __a > 0 && __b > type_max(typeof(__b)) / __a; \ -}) +#define wrapping_add(type, a, b) \ + ({ \ + type __val; \ + __builtin_add_overflow(a, b, &__val); \ + __val; \ + }) -/* - * For signed types, detecting overflow is much harder, especially if - * we want to avoid UB. But the interface of these macros is such that - * we must provide a result in *d, and in fact we must produce the - * result promised by gcc's builtins, which is simply the possibly - * wrapped-around value. Fortunately, we can just formally do the - * operations in the widest relevant unsigned type (u64) and then - * truncate the result - gcc is smart enough to generate the same code - * with and without the (u64) casts. +/** + * wrapping_assign_add() - Intentionally perform a wrapping increment assignment + * @var: variable to be incremented + * @offset: amount to add + * + * Increments @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. */ +#define wrapping_assign_add(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ + }) -/* - * Adding two signed integers can overflow only if they have the same - * sign, and overflow has happened iff the result has the opposite - * sign. +/** + * check_sub_overflow() - Calculate subtraction with overflow checking + * @a: minuend; value to subtract from + * @b: subtrahend; value to subtract from @a + * @d: pointer to store difference + * + * Returns true on wrap-around, false otherwise. + * + * *@d holds the results of the attempted subtraction, regardless of whether + * wrap-around occurred. */ -#define __signed_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a + (u64)__b; \ - (((~(__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) +#define check_sub_overflow(a, b, d) \ + __must_check_overflow(__builtin_sub_overflow(a, b, d)) -/* - * Subtraction is similar, except that overflow can now happen only - * when the signs are opposite. In this case, overflow has happened if - * the result has the opposite sign of a. +/** + * wrapping_sub() - Intentionally perform a wrapping subtraction + * @type: type for result of calculation + * @a: minuend; value to subtract from + * @b: subtrahend; value to subtract from @a + * + * Return the potentially wrapped-around subtraction without + * tripping any wrap-around sanitizers that may be enabled. */ -#define __signed_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a - (u64)__b; \ - ((((__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) +#define wrapping_sub(type, a, b) \ + ({ \ + type __val; \ + __builtin_sub_overflow(a, b, &__val); \ + __val; \ + }) -/* - * Signed multiplication is rather hard. gcc always follows C99, so - * division is truncated towards 0. This means that we can write the - * overflow check like this: - * - * (a > 0 && (b > MAX/a || b < MIN/a)) || - * (a < -1 && (b > MIN/a || b < MAX/a) || - * (a == -1 && b == MIN) - * - * The redundant casts of -1 are to silence an annoying -Wtype-limits - * (included in -Wextra) warning: When the type is u8 or u16, the - * __b_c_e in check_mul_overflow obviously selects - * __unsigned_mul_overflow, but unfortunately gcc still parses this - * code and warns about the limited range of __b. +/** + * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign + * @var: variable to be decremented + * @offset: amount to subtract + * + * Decrements @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. */ +#define wrapping_assign_sub(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ + }) -#define __signed_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - typeof(a) __tmax = type_max(typeof(a)); \ - typeof(a) __tmin = type_min(typeof(a)); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a * (u64)__b; \ - (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ - (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ - (__b == (typeof(__b))-1 && __a == __tmin); \ -}) - - -#define check_add_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d))) - -#define check_sub_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d))) - -#define check_mul_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d))) - -#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ +/** + * check_mul_overflow() - Calculate multiplication with overflow checking + * @a: first factor + * @b: second factor + * @d: pointer to store product + * + * Returns true on wrap-around, false otherwise. + * + * *@d holds the results of the attempted multiplication, regardless of whether + * wrap-around occurred. + */ +#define check_mul_overflow(a, b, d) \ + __must_check_overflow(__builtin_mul_overflow(a, b, d)) -/** check_shl_overflow() - Calculate a left-shifted value and check overflow +/** + * wrapping_mul() - Intentionally perform a wrapping multiplication + * @type: type for result of calculation + * @a: first factor + * @b: second factor * + * Return the potentially wrapped-around multiplication without + * tripping any wrap-around sanitizers that may be enabled. + */ +#define wrapping_mul(type, a, b) \ + ({ \ + type __val; \ + __builtin_mul_overflow(a, b, &__val); \ + __val; \ + }) + +/** + * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * - * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: - * - 'a << s' causes bits to be lost when stored in *d. - * - 's' is garbage (e.g. negative) or so large that the result of - * 'a << s' is guaranteed to be 0. - * - 'a' is negative. - * - 'a << s' sets the sign bit, if any, in '*d'. - * - * '*d' will hold the results of the attempted shift, but is not - * considered "safe for use" if false is returned. + * + * - '@a << @s' causes bits to be lost when stored in *@d. + * - '@s' is garbage (e.g. negative) or so large that the result of + * '@a << @s' is guaranteed to be 0. + * - '@a' is negative. + * - '@a << @s' sets the sign bit, if any, in '*@d'. + * + * '*@d' will hold the results of the attempted shift, but is not + * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ - u64 _a_full = _a; \ + unsigned long long _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ @@ -248,87 +207,141 @@ static inline bool __must_check __must_check_overflow(bool overflow) (*_d >> _to_shift) != _a); \ })) +#define __overflows_type_constexpr(x, T) ( \ + is_unsigned_type(typeof(x)) ? \ + (x) > type_max(T) : \ + is_unsigned_type(typeof(T)) ? \ + (x) < 0 || (x) > type_max(T) : \ + (x) < type_min(T) || (x) > type_max(T)) + +#define __overflows_type(x, T) ({ \ + typeof(T) v = 0; \ + check_add_overflow((x), v, &v); \ +}) + /** - * array_size() - Calculate size of 2-dimensional array. + * overflows_type - helper for checking the overflows between value, variables, + * or data type * - * @a: dimension one - * @b: dimension two + * @n: source constant value or variable to be checked + * @T: destination variable or data type proposed to store @x * - * Calculates size of 2-dimensional array: @a * @b. + * Compares the @x expression for whether or not it can safely fit in + * the storage of the type in @T. @x and @T can have different types. + * If @x is a constant expression, this will also resolve to a constant + * expression. * - * Returns: number of bytes needed to represent the array or SIZE_MAX on - * overflow. + * Returns: true if overflow can occur, false otherwise. + */ +#define overflows_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + __overflows_type_constexpr(n, T), \ + __overflows_type(n, T)) + +/** + * castable_to_type - like __same_type(), but also allows for casted literals + * + * @n: variable or constant value + * @T: variable or data type + * + * Unlike the __same_type() macro, this allows a constant value as the + * first argument. If this value would not overflow into an assignment + * of the second argument's type, it returns true. Otherwise, this falls + * back to __same_type(). */ -static inline __must_check size_t array_size(size_t a, size_t b) +#define castable_to_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + !__overflows_type_constexpr(n, T), \ + __same_type(n, T)) + +/** + * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX + * @factor1: first factor + * @factor2: second factor + * + * Returns: calculate @factor1 * @factor2, both promoted to size_t, + * with any overflow causing the return value to be SIZE_MAX. The + * lvalue must be size_t to avoid implicit type conversion. + */ +static inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; - if (check_mul_overflow(a, b, &bytes)) + if (check_mul_overflow(factor1, factor2, &bytes)) return SIZE_MAX; return bytes; } /** - * array3_size() - Calculate size of 3-dimensional array. + * size_add() - Calculate size_t addition with saturation at SIZE_MAX + * @addend1: first addend + * @addend2: second addend * - * @a: dimension one - * @b: dimension two - * @c: dimension three - * - * Calculates size of 3-dimensional array: @a * @b * @c. - * - * Returns: number of bytes needed to represent the array or SIZE_MAX on - * overflow. + * Returns: calculate @addend1 + @addend2, both promoted to size_t, + * with any overflow causing the return value to be SIZE_MAX. The + * lvalue must be size_t to avoid implicit type conversion. */ -static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +static inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; - if (check_mul_overflow(a, b, &bytes)) - return SIZE_MAX; - if (check_mul_overflow(bytes, c, &bytes)) + if (check_add_overflow(addend1, addend2, &bytes)) return SIZE_MAX; return bytes; } -/* - * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for - * struct_size() below. +/** + * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX + * @minuend: value to subtract from + * @subtrahend: value to subtract from @minuend + * + * Returns: calculate @minuend - @subtrahend, both promoted to size_t, + * with any overflow causing the return value to be SIZE_MAX. For + * composition with the size_add() and size_mul() helpers, neither + * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). + * The lvalue must be size_t to avoid implicit type conversion. */ -static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) +static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; - if (check_mul_overflow(a, b, &bytes)) - return SIZE_MAX; - if (check_add_overflow(bytes, c, &bytes)) + if (minuend == SIZE_MAX || subtrahend == SIZE_MAX || + check_sub_overflow(minuend, subtrahend, &bytes)) return SIZE_MAX; return bytes; } /** - * struct_size() - Calculate size of structure with trailing array. - * @p: Pointer to the structure. - * @member: Name of the array member. - * @count: Number of elements in the array. + * array_size() - Calculate size of 2-dimensional array. + * @a: dimension one + * @b: dimension two * - * Calculates size of memory needed for structure @p followed by an - * array of @count number of @member elements. + * Calculates size of 2-dimensional array: @a * @b. * - * Return: number of bytes needed or SIZE_MAX on overflow. + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. */ -#define struct_size(p, member, count) \ - __ab_c_size(count, \ - sizeof(*(p)->member) + __must_be_array((p)->member),\ - sizeof(*(p))) +#define array_size(a, b) size_mul(a, b) + +/** + * array3_size() - Calculate size of 3-dimensional array. + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +#define array3_size(a, b, c) size_mul(size_mul(a, b), c) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. - * * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. @@ -339,7 +352,92 @@ static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ - array_size(count, \ - sizeof(*(p)->member) + __must_be_array((p)->member)) + __builtin_choose_expr(__is_constexpr(count), \ + (count) * sizeof(*(p)->member) + __must_be_array((p)->member), \ + size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member))) + +/** + * struct_size() - Calculate size of structure with trailing flexible array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @count: Number of elements in the array. + * + * Calculates size of memory needed for structure of @p followed by an + * array of @count number of @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, count) \ + __builtin_choose_expr(__is_constexpr(count), \ + sizeof(*(p)) + flex_array_size(p, member, count), \ + size_add(sizeof(*(p)), flex_array_size(p, member, count))) + +/** + * struct_size_t() - Calculate size of structure with trailing flexible array + * @type: structure type name. + * @member: Name of the array member. + * @count: Number of elements in the array. + * + * Calculates size of memory needed for structure @type followed by an + * array of @count number of @member elements. Prefer using struct_size() + * when possible instead, to keep calculations associated with a specific + * instance variable of type @type. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size_t(type, member, count) \ + struct_size((type *)NULL, member, count) + +/** + * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. + * Enables caller macro to pass (different) initializer. + * + * @type: structure type name, including "struct" keyword. + * @name: Name for a variable to define. + * @member: Name of the array member. + * @count: Number of elements in the array; must be compile-time const. + * @initializer: initializer expression (could be empty for no init). + */ +#define _DEFINE_FLEX(type, name, member, count, initializer...) \ + _Static_assert(__builtin_constant_p(count), \ + "onstack flex array members require compile-time const count"); \ + union { \ + u8 bytes[struct_size_t(type, member, count)]; \ + type obj; \ + } name##_u initializer; \ + type *name = (type *)&name##_u + +/** + * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member, when it does not have a __counted_by annotation. + * + * @type: structure type name, including "struct" keyword. + * @name: Name for a variable to define. + * @member: Name of the array member. + * @count: Number of elements in the array; must be compile-time const. + * + * Define a zeroed, on-stack, instance of @type structure with a trailing + * flexible array member. + * Use __struct_size(@name) to get compile-time size of it afterwards. + */ +#define DEFINE_RAW_FLEX(type, name, member, count) \ + _DEFINE_FLEX(type, name, member, count, = {}) + +/** + * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member. + * + * @TYPE: structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @MEMBER: Name of the array member. + * @COUNTER: Name of the __counted_by member. + * @COUNT: Number of elements in the array; must be compile-time const. + * + * Define a zeroed, on-stack, instance of @TYPE structure with a trailing + * flexible array member. + * Use __struct_size(@NAME) to get compile-time size of it afterwards. + */ +#define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ + _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) #endif /* __LINUX_OVERFLOW_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 5fb8b0f4..cd35d1cf 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -216,6 +216,7 @@ #include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "snapshot_types.h" #include "time_stats.h" #include "util.h" @@ -709,7 +710,7 @@ struct btree_transaction_stats { unsigned nr_max_paths; unsigned journal_entries_size; unsigned max_mem; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_trans_kmalloc_trace trans_kmalloc_trace; #endif char *max_paths_text; @@ -869,7 +870,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; - struct work_struct snapshot_delete_work; + struct snapshot_delete snapshot_delete; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 7ce475c5..0beff6af 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -695,7 +695,8 @@ struct bch_sb_field_ext { x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 9d941619..9c088d94 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -3089,7 +3089,7 @@ void bch2_trans_copy_iter(struct btree_trans *trans, dst->key_cache_path = 0; } -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, darray_trans_kmalloc_trace *trace) { @@ -3112,7 +3112,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long void *p; if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE struct printbuf buf = PRINTBUF; bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); bch2_print_str(c, KERN_ERR, buf.buf); @@ -3127,7 +3127,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long struct btree_transaction_stats *s = btree_trans_stats(trans); if (new_bytes > s->max_mem) { mutex_lock(&s->lock); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, trans->trans_kmalloc_trace.nr); @@ -3314,7 +3314,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE trans->trans_kmalloc_trace.nr = 0; #endif @@ -3486,6 +3486,8 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); +#endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_exit(&trans->trans_kmalloc_trace); #endif @@ -3642,7 +3644,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_exit(&s->trans_kmalloc_trace); #endif kfree(s->max_paths_text); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 7d00d2ff..78a805a8 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -543,7 +543,7 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE void bch2_trans_kmalloc_trace_to_text(struct printbuf *, darray_trans_kmalloc_trace *); #endif @@ -553,7 +553,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, unsigned long ip) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_push(&trans->trans_kmalloc_trace, ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); #endif diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 9e6e47d5..3acccca3 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -495,7 +495,7 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_trans_kmalloc_trace trans_kmalloc_trace; #endif diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 8b8800ad..4ee5d486 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -760,7 +760,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE printbuf_indent_add(&i->buf, 2); bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); printbuf_indent_sub(&i->buf, 2); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index b012b9dd..ba4de071 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -692,7 +692,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv vfs_d_type(d.v->d_type)); if (ret) ctx->pos = d.k->p.offset + 1; - return ret; + return !ret; } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) @@ -717,7 +717,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) if (ret2 > 0) continue; - ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); }))); bch2_bkey_buf_exit(&sk, c); diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index 7be71952..2dcccf1e 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -287,7 +287,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { - struct bch_replicas_padded r; + union bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? bch2_mark_replicas(c, &r.e) : 0; @@ -361,7 +361,7 @@ err: int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && @@ -425,10 +425,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { - struct { + union { + u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, + BCH_BKEY_PTRS_MAX)]; struct bch_replicas_usage r; - u8 pad[BCH_BKEY_PTRS_MAX]; } u; + u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; @@ -627,7 +629,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; + union bch_replicas_padded r; __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 79688902..c1a2a957 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -86,35 +86,6 @@ err: return ret; } -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - out->atomic++; - rcu_read_lock(); - - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - if (!g) - goto out; - - for (unsigned i = 0; i < g->nr; i++) { - if (i) - prt_printf(out, " "); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - continue; - } - - prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); - prt_printf(out, "]"); - } - -out: - rcu_read_unlock(); - out->atomic--; -} - static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -241,17 +212,14 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - struct bch_disk_groups_cpu *g; - const struct bch_devs_mask *m; - bool ret; - rcu_read_lock(); - g = rcu_dereference(c->disk_groups); - m = g && t.group < g->nr && !g->entries[t.group].deleted + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + const struct bch_devs_mask *m = + g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - ret = m ? test_bit(dev, m->d) : false; + bool ret = m ? test_bit(dev, m->d) : false; rcu_read_unlock(); return ret; @@ -377,54 +345,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, + unsigned v) { - struct bch_disk_groups_cpu *groups; - struct bch_disk_group_cpu *g; - unsigned nr = 0; u16 path[32]; - - out->atomic++; - rcu_read_lock(); - groups = rcu_dereference(c->disk_groups); - if (!groups) - goto invalid; + unsigned nr = 0; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; - if (v >= groups->nr) + if (v >= (g ? g->nr : 0)) goto invalid; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + v; - if (g->deleted) + if (e->deleted) goto invalid; path[nr++] = v; - if (!g->parent) + if (!e->parent) break; - v = g->parent - 1; + v = e->parent - 1; } while (nr) { - v = path[--nr]; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + path[--nr]; - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); if (nr) prt_printf(out, "."); } -out: - rcu_read_unlock(); - out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); - goto out; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_make_room(out, 4096); + + out->atomic++; + rcu_read_lock(); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + for (unsigned i = 0; i < (g ? g->nr : 0); i++) { + prt_printf(out, "%2u: ", i); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + goto next; + } + + __bch2_disk_path_to_text(out, g, i); + + prt_printf(out, " devs"); + + for_each_member_device_rcu(c, ca, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); +next: + prt_newline(out); + } + + rcu_read_unlock(); + out->atomic--; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + out->atomic++; + rcu_read_lock(); + __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), + rcu_read_unlock(); + --out->atomic; } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) @@ -470,23 +465,22 @@ inval: int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { - struct bch_member *mi; - int ret, v = -1; + lockdep_assert_held(&c->sb_lock); - if (!strlen(name) || !strcmp(name, "none")) - return 0; - v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) - return v; + if (!strlen(name) || !strcmp(name, "none")) { + struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_GROUP(mi, 0); + } else { + int v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) + return v; - ret = bch2_sb_disk_groups_to_cpu(c); - if (ret) - return ret; + struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_GROUP(mi, v + 1); + } - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, v + 1); - return 0; + return bch2_sb_disk_groups_to_cpu(c); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 1dde0a03..417f89dd 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -2223,10 +2223,10 @@ void bch2_fs_ec_stop(struct bch_fs *c) static bool bch2_fs_ec_flush_done(struct bch_fs *c) { - bool ret; + sched_annotate_sleep(); mutex_lock(&c->ec_stripe_new_lock); - ret = list_empty(&c->ec_stripe_new_list); + bool ret = list_empty(&c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); return ret; diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 06144bfd..809446c7 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -4,9 +4,10 @@ #include "bcachefs_format.h" -struct bch_replicas_padded { +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; struct bch_replicas_entry_v1 e; - u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -28,7 +29,7 @@ struct gc_stripe { u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - struct bch_replicas_padded r; + union bch_replicas_padded r; }; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 801e9cd6..b1e9ee28 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -147,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c, void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *quota_res, s64 sectors) { - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); + if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + if (sectors < 0) + sectors = -inode->v.i_blocks; + else + sectors = 0; + } + inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -244,7 +258,6 @@ out: if (!ret) ret = err; - bch_err_fn(c, ret); return ret; } @@ -506,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap, goto err; } - bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal), c, - "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, - inode->ei_inode.bi_sectors); + if (unlikely(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal))) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } ret = bch2_setattr_nonsize(idmap, inode, iattr); err: diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 7d3dd1a0..dd88113a 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -785,12 +785,11 @@ static int ref_visible2(struct bch_fs *c, #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) struct inode_walker_entry { struct bch_inode_unpacked inode; - u32 snapshot; u64 count; u64 i_size; }; @@ -824,7 +823,6 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return bch2_inode_unpack(inode, &u) ?: darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = inode.k->p.snapshot, })); } @@ -864,47 +862,45 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) +lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) { - bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + struct bch_fs *c = trans->c; struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) goto found; return NULL; found: - BUG_ON(k.k->p.snapshot > i->snapshot); - - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; - - new.snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; + BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); + struct printbuf buf = PRINTBUF; + int ret = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, + trans, snapshot_key_missing_inode_snapshot, + "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); - printbuf_exit(&buf); - - while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) - --i; + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct bch_inode_unpacked new = i->inode; - size_t pos = i - w->inodes.data; - int ret = darray_insert_item(&w->inodes, pos, new); - if (ret) - return ERR_PTR(ret); + new.bi_snapshot = k.k->p.snapshot; - i = w->inodes.data + pos; + ret = __bch2_fsck_write_inode(trans, &new) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto fsck_err; } + printbuf_exit(&buf); return i; +fsck_err: + printbuf_exit(&buf); + return ERR_PTR(ret); } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, @@ -919,7 +915,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, k); + return lookup_inode_for_snapshot(trans, w, k); } static int get_visible_inodes(struct btree_trans *trans, @@ -1496,21 +1492,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); if (w->recalculate_sums) i->count = count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); @@ -1821,20 +1817,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, + i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); @@ -1856,8 +1852,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; i->count += k.k->size; @@ -1939,13 +1935,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); if (count2 < 0) return count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1954,7 +1950,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (fsck_err_on(i->inode.bi_nlink != i->count, trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 73a0a42a..885c5f71 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -1172,8 +1172,6 @@ retry_pick: bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - - async_object_list_add(c, rbio, rbio, &rbio->list_idx); } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error @@ -1187,8 +1185,6 @@ retry_pick: &c->bio_read_split), orig); rbio->bio.bi_iter = iter; - - async_object_list_add(c, rbio, rbio, &rbio->list_idx); } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1219,6 +1215,8 @@ retry_pick: rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + async_object_list_add(c, rbio, rbio, &rbio->list_idx); + /* XXX: also nvme read recovery level */ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) rbio->bio.bi_opf |= REQ_FUA; diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index f7c5fcbf..399df8fe 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -256,10 +256,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } if (i_sectors_delta) { + s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); + if (unlikely(bi_sectors + i_sectors_delta < 0)) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", + extent_iter->pos.inode, bi_sectors, i_sectors_delta); + + bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + if (i_sectors_delta < 0) + i_sectors_delta = -bi_sectors; + else + i_sectors_delta = 0; + } + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); inode_update_flags = 0; } + /* + * extents, dirents and xattrs updates require that an inode update also + * happens - to ensure that if a key exists in one of those btrees with + * a given snapshot ID an inode is also present - so we may have to skip + * the nojournal optimization: + */ if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 58e3983d..55b76a4d 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1404,7 +1404,7 @@ int bch2_journal_read(struct bch_fs *c, } genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct bch_replicas_padded replicas = { + union bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_devs = 0, .e.nr_required = 1, @@ -1632,7 +1632,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1784,7 +1784,7 @@ static CLOSURE_CALLBACK(journal_write_submit) BCH_DEV_WRITE_REF_journal_write); if (!ca) { /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); + bch_err(c, "missing device %u for journal write", ptr->dev); continue; } @@ -2055,7 +2055,7 @@ CLOSURE_CALLBACK(bch2_journal_write) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); int ret; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index c3a32403..17beba0e 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -955,7 +955,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); while (!ret) { - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); if (seq >= j->pin.back) diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 98c89852..448326c0 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -209,6 +209,7 @@ enum bch_fsck_flags { x(subvol_to_missing_root, 188, 0) \ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ x(bkey_in_missing_snapshot, 190, 0) \ + x(bkey_in_deleted_snapshot, 315, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ x(inode_alloc_cursor_inode_bad, 301, 0) \ @@ -216,6 +217,7 @@ enum bch_fsck_flags { x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ x(inode_snapshot_mismatch, 196, 0) \ + x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_and_not_open, 281, 0) \ @@ -236,6 +238,9 @@ enum bch_fsck_flags { x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ + x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -320,7 +325,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 311, 0) + x(MAX, 316, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index e810d85c..9c383d9a 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -139,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 94cf60f7..f074b9de 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_key_cache.h" @@ -212,7 +213,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), + BCH_SNAPSHOT_WILL_DELETE(s.v), le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -313,7 +314,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -338,7 +341,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) { + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); @@ -710,6 +713,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -747,7 +753,7 @@ static int check_snapshot(struct btree_trans *trans, } bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); + !BCH_SNAPSHOT_WILL_DELETE(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); @@ -997,7 +1003,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1022,22 +1028,38 @@ err: return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int __bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); + + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened: + */ + if (fsck_err_on(state == SNAPSHOT_ID_empty, trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -1061,10 +1083,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) } /* already deleted? */ - if (BCH_SNAPSHOT_DELETED(&s->v)) + if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) goto err; - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; err: @@ -1084,24 +1106,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) struct btree_iter iter, p_iter = {}; struct btree_iter c_iter = {}; struct btree_iter tree_iter = {}; - struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_intent, snapshot); + ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) goto err; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -1159,24 +1182,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) */ struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if (s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); @@ -1346,12 +1383,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { darray_for_each(*l, i) @@ -1385,28 +1416,28 @@ static unsigned __live_child(struct snapshot_table *t, u32 id, return 0; } -static unsigned live_child(struct bch_fs *c, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static unsigned live_child(struct bch_fs *c, u32 id) { + struct snapshot_delete *d = &c->snapshot_delete; + rcu_read_lock(); u32 ret = __live_child(rcu_dereference(c->snapshots), id, - delete_leaves, delete_interior); + &d->delete_leaves, &d->delete_interior); rcu_read_unlock(); return ret; } static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) + struct bkey_s_c k) { - if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + struct snapshot_delete *d = &trans->c->snapshot_delete; + + if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); @@ -1437,46 +1468,70 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } +static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + bool ret = !snapshot_list_has_id(&d->deleting_from_trees, + bch2_snapshot_tree(c, iter->pos.snapshot)); + if (unlikely(ret)) { + struct bpos pos = iter->pos; + pos.snapshot = 0; + if (iter->btree_id != BTREE_ID_inodes) + pos.offset = U64_MAX; + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); + } + + return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) { if (k.k->type != KEY_TYPE_snapshot) return 0; struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); unsigned live_children = 0; if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); live_children += child && - !snapshot_list_has_id(delete_leaves, child); + !snapshot_list_has_id(&d->delete_leaves, child); } + u32 tree = bch2_snapshot_tree(c, s.k->p.offset); + if (live_children == 0) { - return snapshot_list_add(c, delete_leaves, s.k->p.offset); + return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { - struct snapshot_interior_delete d = { + struct snapshot_interior_delete n = { .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + .live_child = live_child(c, s.k->p.offset), }; - if (!d.live_child) { - bch_err(c, "error finding live child of snapshot %u", d.id); + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); return -EINVAL; } - return darray_push(delete_interior, d); + return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + darray_push(&d->delete_interior, n); } else { return 0; } @@ -1508,6 +1563,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != KEY_TYPE_snapshot) return 0; @@ -1555,39 +1613,52 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return bch2_trans_update(trans, iter, &s->k_i, 0); } +static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) +{ + prt_printf(out, "deleting from trees"); + darray_for_each(d->deleting_from_trees, i) + prt_printf(out, " %u", *i); + + prt_printf(out, "deleting leaves"); + darray_for_each(d->delete_leaves, i) + prt_printf(out, " %u", *i); + + prt_printf(out, " interior"); + darray_for_each(d->delete_interior, i) + prt_printf(out, " %u->%u", i->id, i->live_child); +} + int bch2_delete_dead_snapshots(struct bch_fs *c) { if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; struct btree_trans *trans = bch2_trans_get(c); - snapshot_id_list delete_leaves = {}; - interior_delete_list delete_interior = {}; + struct snapshot_delete *d = &c->snapshot_delete; int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ + mutex_lock(&d->lock); + d->running = true; + d->pos = BBPOS_MIN; + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + check_should_delete_snapshot(trans, k)); + mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - if (!delete_leaves.nr && !delete_interior.nr) + if (!d->delete_leaves.nr && !d->delete_interior.nr) goto err; { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "deleting leaves"); - darray_for_each(delete_leaves, i) - prt_printf(&buf, " %u", *i); - - prt_printf(&buf, " interior"); - darray_for_each(delete_interior, i) - prt_printf(&buf, " %u->%u", i->id, i->live_child); + bch2_snapshot_delete_nodes_to_text(&buf, d); ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); printbuf_exit(&buf); @@ -1595,19 +1666,25 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(btree)) + d->pos.pos = POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) continue; ret = for_each_btree_key_commit(trans, iter, - btree, POS_MIN, + d->pos.btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, - &delete_leaves, - &delete_interior)); + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); bch2_disk_reservation_put(c, &res); @@ -1617,7 +1694,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - darray_for_each(delete_leaves, i) { + darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); if (!bch2_err_matches(ret, EROFS)) @@ -1634,11 +1711,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); if (ret) goto err; - darray_for_each(delete_interior, i) { + darray_for_each(d->delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, i->id)); if (!bch2_err_matches(ret, EROFS)) @@ -1647,8 +1724,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } err: - darray_exit(&delete_interior); - darray_exit(&delete_leaves); + mutex_lock(&d->lock); + darray_exit(&d->deleting_from_trees); + darray_exit(&d->delete_interior); + darray_exit(&d->delete_leaves); + d->running = false; + mutex_unlock(&d->lock); bch2_trans_put(trans); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); @@ -1657,7 +1738,7 @@ err: void bch2_delete_dead_snapshots_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); @@ -1672,10 +1753,27 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } +void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + + if (!d->running) { + prt_str(out, "(not running)"); + return; + } + + mutex_lock(&d->lock); + bch2_snapshot_delete_nodes_to_text(out, d); + prt_newline(out); + mutex_unlock(&d->lock); + + bch2_bbpos_to_text(out, d->pos); +} + int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, enum btree_id id, struct bpos pos) @@ -1714,7 +1812,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct return 0; struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || + if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || interior_snapshot_needs_delete(snap)) set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); @@ -1750,3 +1848,10 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) { kvfree(rcu_dereference_protected(c->snapshots, true)); } + +void bch2_fs_snapshots_init_early(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.lock); + mutex_init(&c->snapshots_unlinked_lock); +} diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index 81180181..69c484b7 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -120,21 +120,26 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->live : 0; + return s ? s->state : SNAPSHOT_ID_empty; } -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { rcu_read_lock(); - bool ret = __bch2_snapshot_exists(c, id); + enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); rcu_read_unlock(); return ret; } +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; +} + static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); @@ -241,10 +246,19 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -259,7 +273,13 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_work(struct work_struct *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); +void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); +void bch2_fs_snapshots_init_early(struct bch_fs *); #endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/libbcachefs/snapshot_format.h b/libbcachefs/snapshot_format.h index aabcd3a7..9bccae1f 100644 --- a/libbcachefs/snapshot_format.h +++ b/libbcachefs/snapshot_format.h @@ -15,10 +15,10 @@ struct bch_snapshot { bch_le128 btime; }; -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - +LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/libbcachefs/snapshot_types.h b/libbcachefs/snapshot_types.h new file mode 100644 index 00000000..a64f4b94 --- /dev/null +++ b/libbcachefs/snapshot_types.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_TYPES_H +#define _BCACHEFS_SNAPSHOT_TYPES_H + +#include "bbpos_types.h" +#include "darray.h" +#include "subvolume_types.h" + +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; +#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +struct snapshot_delete { + struct work_struct work; + + struct mutex lock; + snapshot_id_list deleting_from_trees; + snapshot_id_list delete_leaves; + interior_delete_list delete_interior; + + bool running; + struct bbpos pos; +}; + +#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 51ab2ee1..3c6ba146 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -730,8 +730,6 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) void bch2_fs_subvolumes_init_early(struct bch_fs *c) { - INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); - mutex_init(&c->snapshots_unlinked_lock); } diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index ee5e4e5a..075f55e2 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -77,9 +77,6 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btr _end, _subvolid, _flags, _k, _do); \ }) -int bch2_delete_dead_snapshots(struct bch_fs *); -void bch2_delete_dead_snapshots_async(struct bch_fs *); - int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index 1549d6da..9d634b90 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -2,33 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - bool live; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - typedef struct { /* we can't have padding in this struct: */ u64 subvol; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 0f9c38f8..96264686 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -864,6 +864,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_quota_init(c); bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_snapshots_init_early(c); bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -1488,7 +1489,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + if (!ca->name[0]) + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1540,6 +1543,11 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (ret) return ret; + struct printbuf name = PRINTBUF; + prt_bdevname(&name, sb->bdev); + strscpy(ca->name, name.buf, sizeof(ca->name)); + printbuf_exit(&name); + /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); @@ -1581,11 +1589,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); - struct printbuf name = PRINTBUF; - prt_bdevname(&name, ca->disk_sb.bdev); - strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); - bch2_rebalance_wakeup(c); return 0; } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 7c840b47..eafaa2c8 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -147,8 +147,9 @@ write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_freelist_wakeup); write_attribute(trigger_btree_updates); +write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_recalc_capacity); read_attribute(gc_gens_pos); __sysfs_attribute(read_fua_test, 0400); @@ -199,6 +200,7 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(snapshot_delete_status); read_attribute(new_stripes); @@ -431,6 +433,9 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) + bch2_snapshot_delete_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -540,6 +545,12 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_trigger_recalc_capacity) { + down_read(&c->state_lock); + bch2_recalc_capacity(c); + up_read(&c->state_lock); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -571,6 +582,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_snapshot_delete_status, &sysfs_compression_stats, @@ -665,8 +677,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_freelist_wakeup, &sysfs_trigger_btree_updates, + &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_recalc_capacity, &sysfs_gc_gens_pos, diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 3d324e48..ea3f87f6 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); + &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); } static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) @@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) return l.v->x_type != r->type || l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); + memcmp(l.v->x_name_and_value, r->name.name, r->name.len); } static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l.v->x_type != r.v->x_type || l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); + memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); } const struct bch_hash_desc bch2_xattr_hash_desc = { @@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: @@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, unsigned name_len = xattr.v->x_name_len; unsigned val_len = le16_to_cpu(xattr.v->x_val_len); unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name); + offsetof(struct bch_xattr, x_name_and_value); val_len = min_t(int, val_len, max_name_val_bytes - name_len); name_len = min(name_len, max_name_val_bytes); prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name, + name_len, xattr.v->x_name_and_value, val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || @@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, if (ret) return ret; + /* + * Besides the ctime update, extents, dirents and xattrs updates require + * that an inode update also happens - to ensure that if a key exists in + * one of those btrees with a given snapshot ID an inode is also present + */ inode_u->bi_ctime = bch2_current_time(c); ret = bch2_inode_write(trans, &inode_iter, inode_u); @@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, xattr->v.x_type = type; xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr->v.x_name_and_value, name, namelen); memcpy(xattr_val(&xattr->v), value, size); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, @@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry, if (!prefix) return 0; - return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); + return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 132fbbd1..1139bf34 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) struct xattr_search_key { u8 type; diff --git a/libbcachefs/xattr_format.h b/libbcachefs/xattr_format.h index c7916011..4121b78d 100644 --- a/libbcachefs/xattr_format.h +++ b/libbcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[] __counted_by(x_name_len); + /* + * x_name contains the name and value counted by + * x_name_len + x_val_len. The introduction of + * __counted_by(x_name_len) previously caused a false positive + * detection of an out of bounds write. + */ + __u8 x_name_and_value[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ |