summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2024-08-08 11:42:15 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2024-08-08 11:51:07 -0400
commit95e8cddd3bdf4a68b4f87472e4063239d978bfc9 (patch)
treeec4d365e70df54543d5cbd05189081108fb818da
parentda1d5571f5ca526100797b47f9496cf2dcd03e38 (diff)
Update bcachefs sources to dfc7b2c9433e bcachefs: bch2_sb_nr_devices()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/min_heap.h236
-rw-r--r--include/linux/poison.h13
-rw-r--r--libbcachefs/acl.c13
-rw-r--r--libbcachefs/acl.h2
-rw-r--r--libbcachefs/alloc_background.h12
-rw-r--r--libbcachefs/alloc_foreground.c43
-rw-r--r--libbcachefs/alloc_foreground.h9
-rw-r--r--libbcachefs/bcachefs.h4
-rw-r--r--libbcachefs/bcachefs_format.h4
-rw-r--r--libbcachefs/btree_iter.c6
-rw-r--r--libbcachefs/btree_iter.h18
-rw-r--r--libbcachefs/btree_update_interior.c2
-rw-r--r--libbcachefs/clock.c50
-rw-r--r--libbcachefs/clock.h9
-rw-r--r--libbcachefs/clock_types.h2
-rw-r--r--libbcachefs/dirent.c66
-rw-r--r--libbcachefs/ec.c110
-rw-r--r--libbcachefs/ec_types.h2
-rw-r--r--libbcachefs/extents.h4
-rw-r--r--libbcachefs/fs-io-buffered.c41
-rw-r--r--libbcachefs/fs-io-direct.c2
-rw-r--r--libbcachefs/fs-io-pagecache.c90
-rw-r--r--libbcachefs/fs-io.c178
-rw-r--r--libbcachefs/fs-ioctl.c4
-rw-r--r--libbcachefs/fs.c217
-rw-r--r--libbcachefs/fs.h18
-rw-r--r--libbcachefs/fsck.c7
-rw-r--r--libbcachefs/io_misc.c6
-rw-r--r--libbcachefs/io_read.c7
-rw-r--r--libbcachefs/io_write.c5
-rw-r--r--libbcachefs/journal_reclaim.c1
-rw-r--r--libbcachefs/opts.h10
-rw-r--r--libbcachefs/sb-members.c9
-rw-r--r--libbcachefs/sb-members.h2
-rw-r--r--libbcachefs/subvolume.h45
-rw-r--r--libbcachefs/super-io.c16
-rw-r--r--libbcachefs/super.c4
-rw-r--r--libbcachefs/sysfs.c14
-rw-r--r--libbcachefs/util.c2
-rw-r--r--libbcachefs/util.h118
-rw-r--r--libbcachefs/xattr.c55
42 files changed, 800 insertions, 658 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 654eedda..fafb2859 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-1592eaa60418d460021ecf44bc405ec49ef14adf
+dfc7b2c9433ed926cad881b8fbee55e36105989b
diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
new file mode 100644
index 00000000..43a7b9dc
--- /dev/null
+++ b/include/linux/min_heap.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIN_HEAP_H
+#define _LINUX_MIN_HEAP_H
+
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/**
+ * Data structure to hold a min-heap.
+ * @nr: Number of elements currently in the heap.
+ * @size: Maximum number of elements that can be held in current storage.
+ * @data: Pointer to the start of array holding the heap elements.
+ * @preallocated: Start of the static preallocated array holding the heap elements.
+ */
+#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \
+struct _name { \
+ int nr; \
+ int size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)
+
+typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;
+
+#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *)
+#define __minheap_obj_size(_heap) sizeof((_heap)->data[0])
+
+/**
+ * struct min_heap_callbacks - Data/functions to customise the min_heap.
+ * @less: Partial order function for this heap.
+ * @swp: Swap elements function.
+ */
+struct min_heap_callbacks {
+ bool (*less)(const void *lhs, const void *rhs, void *args);
+ void (*swp)(void *lhs, void *rhs, void *args);
+};
+
+/* Initialize a min-heap. */
+static __always_inline
+void __min_heap_init(min_heap_char *heap, void *data, int size)
+{
+ heap->nr = 0;
+ heap->size = size;
+ if (data)
+ heap->data = data;
+ else
+ heap->data = heap->preallocated;
+}
+
+#define min_heap_init(_heap, _data, _size) \
+ __min_heap_init((min_heap_char *)_heap, _data, _size)
+
+/* Get the minimum element from the heap. */
+static __always_inline
+void *__min_heap_peek(struct min_heap_char *heap)
+{
+ return heap->nr ? heap->data : NULL;
+}
+
+#define min_heap_peek(_heap) \
+ (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap))
+
+/* Check if the heap is full. */
+static __always_inline
+bool __min_heap_full(min_heap_char *heap)
+{
+ return heap->nr == heap->size;
+}
+
+#define min_heap_full(_heap) \
+ __min_heap_full((min_heap_char *)_heap)
+
+/* Sift the element at pos down the heap. */
+static __always_inline
+void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *left, *right;
+ void *data = heap->data;
+ void *root = data + pos * elem_size;
+ int i = pos, j;
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (;;) {
+ if (i * 2 + 2 >= heap->nr)
+ break;
+ left = data + (i * 2 + 1) * elem_size;
+ right = data + (i * 2 + 2) * elem_size;
+ i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2;
+ }
+
+ /* Special case for the last leaf with no sibling. */
+ if (i * 2 + 2 == heap->nr)
+ i = i * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (i != pos && func->less(root, data + i * elem_size, args))
+ i = (i - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ j = i;
+ while (i != pos) {
+ i = (i - 1) / 2;
+ func->swp(data + i * elem_size, data + j * elem_size, args);
+ }
+}
+
+#define min_heap_sift_down(_heap, _pos, _func, _args) \
+ __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args)
+
+/* Sift up ith element from the heap, O(log2(nr)). */
+static __always_inline
+void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ size_t parent;
+
+ while (idx) {
+ parent = (idx - 1) / 2;
+ if (func->less(data + parent * elem_size, data + idx * elem_size, args))
+ break;
+ func->swp(data + parent * elem_size, data + idx * elem_size, args);
+ idx = parent;
+ }
+}
+
+#define min_heap_sift_up(_heap, _idx, _func, _args) \
+ __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+
+/* Floyd's approach to heapification that is O(nr). */
+static __always_inline
+void __min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ int i;
+
+ for (i = heap->nr / 2 - 1; i >= 0; i--)
+ __min_heap_sift_down(heap, i, elem_size, func, args);
+}
+
+#define min_heapify_all(_heap, _func, _args) \
+ __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+
+/* Remove minimum element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ memcpy(data, data + (heap->nr * elem_size), elem_size);
+ __min_heap_sift_down(heap, 0, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_pop(_heap, _func, _args) \
+ __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+
+/*
+ * Remove the minimum element and then push the given element. The
+ * implementation performs 1 sift (O(log2(nr))) and is therefore more
+ * efficient than a pop followed by a push that does 2.
+ */
+static __always_inline
+void __min_heap_pop_push(min_heap_char *heap,
+ const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func,
+ void *args)
+{
+ memcpy(heap->data, element, elem_size);
+ __min_heap_sift_down(heap, 0, elem_size, func, args);
+}
+
+#define min_heap_pop_push(_heap, _element, _func, _args) \
+ __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+
+/* Push an element on to the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ int pos;
+
+ if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
+ return false;
+
+ /* Place at the end of data. */
+ pos = heap->nr;
+ memcpy(data + (pos * elem_size), element, elem_size);
+ heap->nr++;
+
+ /* Sift child at pos up. */
+ __min_heap_sift_up(heap, elem_size, pos, func, args);
+
+ return true;
+}
+
+#define min_heap_push(_heap, _element, _func, _args) \
+ __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+
+/* Remove ith element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ if (idx == heap->nr)
+ return true;
+ func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
+ __min_heap_sift_up(heap, elem_size, idx, func, args);
+ __min_heap_sift_down(heap, idx, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_del(_heap, _idx, _func, _args) \
+ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+
+#endif /* _LINUX_MIN_HEAP_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 1f0ee245..331a9a99 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -38,11 +38,8 @@
* Magic nums for obj red zoning.
* Placed in the first word before and the first word after an obj.
*/
-#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
-#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
-
-#define SLUB_RED_INACTIVE 0xbb
-#define SLUB_RED_ACTIVE 0xcc
+#define SLUB_RED_INACTIVE 0xbb /* when obj is inactive */
+#define SLUB_RED_ACTIVE 0xcc /* when obj is active */
/* ...and for poisoning */
#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
@@ -52,12 +49,6 @@
/********** arch/$ARCH/mm/init.c **********/
#define POISON_FREE_INITMEM 0xcc
-/********** arch/ia64/hp/common/sba_iommu.c **********/
-/*
- * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
- * value of "SBAIOMMU POISON\0" for spill-over poisoning.
- */
-
/********** fs/jbd/journal.c **********/
#define JBD_POISON_FREE 0x5b
#define JBD2_POISON_FREE 0x5c
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index a7b425d3..87f1be9d 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
- struct dentry *dentry, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct posix_acl *acl = NULL;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
@@ -358,7 +361,7 @@ retry:
bch2_trans_begin(trans);
acl = _acl;
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_intent);
if (ret)
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index 27e7eec0..fe730a6b 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t);
#ifdef CONFIG_BCACHEFS_POSIX_ACL
-struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 8d2b62c9..96a0444e 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
bucket_data_type(bucket) != bucket_data_type(ptr);
}
+/*
+ * It is my general preference to use unsigned types for unsigned quantities -
+ * however, these helpers are used in disk accounting calculations run by
+ * triggers where the output will be negated and added to an s64. unsigned is
+ * right out even though all these quantities will fit in 32 bits, since it
+ * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
+ * simpler option here.
+ */
static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
@@ -166,8 +174,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
* avoid overflowing LRU_TIME_BITS on a corrupted fs, when
* bucket_sectors_dirty is (much) bigger than bucket_size
*/
- u64 d = min(bch2_bucket_sectors_dirty(a),
- ca->mi.bucket_size);
+ u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index cabf866c..922bf2fb 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -496,12 +496,6 @@ again:
for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
alloc_cursor < k.k->p.offset;
alloc_cursor++) {
- ret = btree_trans_too_many_iters(trans);
- if (ret) {
- ob = ERR_PTR(ret);
- break;
- }
-
s->buckets_seen++;
u64 bucket = alloc_cursor & ~(~0ULL << 56);
@@ -1028,9 +1022,6 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
- if (erasure_code && ec_open_bucket(c, ptrs))
- return 0;
-
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, erasure_code, flags);
@@ -1085,7 +1076,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
{
int ret;
- if (erasure_code) {
+ if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
ret = __open_bucket_add_buckets(trans, ptrs, wp,
devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,
@@ -1609,7 +1600,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope
prt_newline(out);
}
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca)
{
struct open_bucket *ob;
@@ -1619,7 +1611,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list)
+ if (ob->valid && !ob->on_partial_list &&
+ (!ca || ob->dev == ca->dev_idx))
bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
@@ -1762,11 +1755,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
}
-void bch2_print_allocator_stuck(struct bch_fs *c)
+static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
{
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n");
+ prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
+ c->opts.allocator_stuck_timeout);
prt_printf(&buf, "Allocator debug:\n");
printbuf_indent_add(&buf, 2);
@@ -1796,3 +1790,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c)
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
+
+static inline unsigned allocator_wait_timeout(struct bch_fs *c)
+{
+ if (c->allocator_last_stuck &&
+ time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
+ return 0;
+
+ return c->opts.allocator_stuck_timeout * HZ;
+}
+
+void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ unsigned t = allocator_wait_timeout(c);
+
+ if (t && closure_sync_timeout(cl, t)) {
+ c->allocator_last_stuck = jiffies;
+ bch2_print_allocator_stuck(c);
+ }
+
+ closure_sync(cl);
+}
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 6da9e7e2..386d231c 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
@@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-void bch2_print_allocator_stuck(struct bch_fs *);
+void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
+static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ if (cl->closure_get_happened)
+ __bch2_wait_on_allocator(c, cl);
+}
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 91361a16..43cfb5da 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -893,6 +893,8 @@ struct bch_fs {
struct bch_fs_usage_base __percpu *usage;
u64 __percpu *online_reserved;
+ unsigned long allocator_last_stuck;
+
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
@@ -1020,6 +1022,7 @@ struct bch_fs {
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
+ struct rhashtable vfs_inodes_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -1082,7 +1085,6 @@ struct bch_fs {
u64 __percpu *counters;
unsigned copy_gc_enabled:1;
- bool promote_whole_extents;
struct bch2_time_stats times[BCH_TIME_STAT_NR];
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 74a60b1a..7861473b 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -792,6 +792,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+ struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
@@ -836,6 +838,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
struct bch_sb, flags[5], 0, 16);
+LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
+ struct bch_sb, flags[5], 16, 32);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index fb04795c..afbb35ab 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1934,6 +1934,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
bch2_trans_verify_not_in_restart(trans);
bch2_btree_iter_verify(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (ret)
+ goto err;
+
+
struct btree_path *path = btree_iter_path(trans, iter);
/* already at end? */
@@ -2720,6 +2725,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
+/* Obsolete, but still used by rust wrapper in -tools */
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
struct bkey_s_c k;
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 6c7a5034..b241f544 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -811,20 +811,6 @@ transaction_restart: \
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -865,7 +851,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, _do); \
+ _ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
@@ -878,7 +864,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, ((_p = _do), 0)); \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index fbcfa5ce..b4f76784 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1264,7 +1264,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
bch2_trans_unlock(trans);
- closure_sync(&cl);
+ bch2_wait_on_allocator(c, &cl);
} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
}
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index df3763c1..1d6b691e 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -6,15 +6,29 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
-static inline long io_timer_cmp(io_timer_heap *h,
- struct io_timer *l,
- struct io_timer *r)
+static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
{
- return l->expire - r->expire;
+ struct io_timer **_l = (struct io_timer **)l;
+ struct io_timer **_r = (struct io_timer **)r;
+
+ return (*_l)->expire < (*_r)->expire;
+}
+
+static inline void io_timer_swp(void *l, void *r, void __always_unused *args)
+{
+ struct io_timer **_l = (struct io_timer **)l;
+ struct io_timer **_r = (struct io_timer **)r;
+
+ swap(*_l, *_r);
}
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
+
spin_lock(&clock->timer_lock);
if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
@@ -23,22 +37,27 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
return;
}
- for (size_t i = 0; i < clock->timers.used; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer)
goto out;
- BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+ BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
out:
spin_unlock(&clock->timer_lock);
}
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
+
spin_lock(&clock->timer_lock);
- for (size_t i = 0; i < clock->timers.used; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer) {
- heap_del(&clock->timers, i, io_timer_cmp, NULL);
+ min_heap_del(&clock->timers, i, &callbacks, NULL);
break;
}
@@ -123,10 +142,17 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
{
struct io_timer *ret = NULL;
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
+
+ if (clock->timers.nr &&
+ time_after_eq64(now, clock->timers.data[0]->expire)) {
+ ret = *min_heap_peek(&clock->timers);
+ min_heap_pop(&clock->timers, &callbacks, NULL);
+ }
- if (clock->timers.used &&
- time_after_eq64(now, clock->timers.data[0]->expire))
- heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
return ret;
}
@@ -150,7 +176,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
printbuf_tabstop_push(out, 40);
prt_printf(out, "current time:\t%llu\n", now);
- for (unsigned i = 0; i < clock->timers.used; i++)
+ for (unsigned i = 0; i < clock->timers.nr; i++)
prt_printf(out, "%ps %ps:\t%llu\n",
clock->timers.data[i]->fn,
clock->timers.data[i]->fn2,
diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h
index 85c975df..82c79c8b 100644
--- a/libbcachefs/clock.h
+++ b/libbcachefs/clock.h
@@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
-
void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);
diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h
index 9c25d0fc..37554e45 100644
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -24,7 +24,7 @@ struct io_timer {
/* Amount to buffer up on a percpu counter */
#define IO_CLOCK_PCPU_SECTORS 128
-typedef HEAP(struct io_timer *) io_timer_heap;
+typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
struct io_clock {
atomic64_t now;
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index d743da89..6c200401 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -553,62 +553,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- subvol_inum target;
- u32 snapshot;
struct bkey_buf sk;
- int ret;
-
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot),
- POS(inum.inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
+ POS(inum.inum, ctx->pos),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
- ret = bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
+ subvol_inum target;
+ int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret2 > 0)
+ continue;
- /*
- * read_target looks up subvolumes, we can overflow paths if the
- * directory has many subvolumes in it
- *
- * XXX: btree_trans_too_many_iters() is something we'd like to
- * get rid of, and there's no good reason to be using it here
- * except that we don't yet have a for_each_btree_key() helper
- * that does subvolume_get_snapshot().
- */
- ret = drop_locks_do(trans,
- bch2_dir_emit(ctx, dirent, target)) ?:
- btree_trans_too_many_iters(trans);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
+ ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ })));
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+ return ret < 0 ? ret : 0;
}
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 86948d11..84f1cbf6 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -901,8 +901,8 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
mutex_lock(&c->ec_stripes_heap_lock);
if (n.size > h->size) {
- memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
- n.used = h->used;
+ memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
+ n.nr = h->nr;
swap(*h, n);
}
mutex_unlock(&c->ec_stripes_heap_lock);
@@ -993,7 +993,7 @@ static u64 stripe_idx_to_delete(struct bch_fs *c)
lockdep_assert_held(&c->ec_stripes_heap_lock);
- if (h->used &&
+ if (h->nr &&
h->data[0].blocks_nonempty == 0 &&
!bch2_stripe_is_open(c, h->data[0].idx))
return h->data[0].idx;
@@ -1001,14 +1001,6 @@ static u64 stripe_idx_to_delete(struct bch_fs *c)
return 0;
}
-static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
- struct ec_stripe_heap_entry l,
- struct ec_stripe_heap_entry r)
-{
- return ((l.blocks_nonempty > r.blocks_nonempty) -
- (l.blocks_nonempty < r.blocks_nonempty));
-}
-
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
size_t i)
{
@@ -1017,39 +1009,71 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}
+static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+
+ return ((_l->blocks_nonempty > _r->blocks_nonempty) <
+ (_l->blocks_nonempty < _r->blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+ ec_stripes_heap *_h = (ec_stripes_heap *)h;
+ size_t i = _l - _h->data;
+ size_t j = _r - _h->data;
+
+ swap(*_l, *_r);
+
+ ec_stripes_heap_set_backpointer(_h, i);
+ ec_stripes_heap_set_backpointer(_h, j);
+}
+
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m = genradix_ptr(&c->stripes, idx);
- BUG_ON(m->heap_idx >= h->used);
+ BUG_ON(m->heap_idx >= h->nr);
BUG_ON(h->data[m->heap_idx].idx != idx);
}
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
+
mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
- heap_del(&c->ec_stripes_heap, m->heap_idx,
- ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
+
mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(heap_full(&c->ec_stripes_heap));
+ BUG_ON(min_heap_full(&c->ec_stripes_heap));
- heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+ genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
+ min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = m->blocks_nonempty,
}),
- ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ &callbacks,
+ &c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
mutex_unlock(&c->ec_stripes_heap_lock);
@@ -1058,6 +1082,10 @@ void bch2_stripes_heap_insert(struct bch_fs *c,
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
ec_stripes_heap *h = &c->ec_stripes_heap;
bool do_deletes;
size_t i;
@@ -1068,10 +1096,8 @@ void bch2_stripes_heap_update(struct bch_fs *c,
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
i = m->heap_idx;
- heap_sift_up(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
- heap_sift_down(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
+ min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
@@ -1783,6 +1809,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
BUG_ON(v->nr_redundant != h->s->nr_parity);
+ /* * We bypass the sector allocator which normally does this: */
+ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
__clear_bit(v->ptrs[i].dev, devs.d);
if (i < h->s->nr_data)
@@ -1864,7 +1893,7 @@ static s64 get_existing_stripe(struct bch_fs *c,
return -1;
mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@@ -2195,7 +2224,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
size_t i;
mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->used, 50); i++) {
+ for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
@@ -2209,6 +2238,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->ec_stripes_heap_lock);
}
+static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct ec_stripe_new *s)
+{
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
+ s->idx, s->nr_data, s->nr_parity,
+ bitmap_weight(s->blocks_allocated, s->nr_data),
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i;
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
+ prt_printf(out, " %u", s->blocks[i]);
+ prt_newline(out);
+}
+
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct ec_stripe_head *h;
@@ -2221,23 +2267,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
bch2_watermarks[h->watermark]);
if (h->s)
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
- h->s->idx, h->s->nr_data, h->s->nr_parity,
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data));
+ bch2_new_stripe_to_text(out, c, h->s);
}
mutex_unlock(&c->ec_stripe_head_lock);
prt_printf(out, "in flight:\n");
mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
- s->idx, s->nr_data, s->nr_parity,
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
- }
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ bch2_new_stripe_to_text(out, c, s);
mutex_unlock(&c->ec_stripe_new_lock);
}
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 976426da..1df03dcc 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -36,6 +36,6 @@ struct ec_stripe_heap_entry {
unsigned blocks_nonempty;
};
-typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index facdb8a8..68f8c30b 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -357,7 +357,7 @@ out: \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
-#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
__bkey_extent_entry_for_each_from(_iter, _end, _iter) \
if (extent_entry_is_crc(_iter)) { \
@@ -372,7 +372,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
(_iter) = (_start); \
- bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ bkey_crc_next(_k, _end, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
#define bkey_for_each_crc(_k, _p, _crc, _iter) \
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index cc33d763..0b900a4a 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
- u32 snapshot;
int ret = 0;
rbio->c = c;
@@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans,
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
-retry:
bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -189,7 +182,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -200,7 +193,7 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -210,7 +203,7 @@ retry:
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
- break;
+ goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@ -229,17 +222,13 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
-err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
if (ret) {
bch_err_inum_offset_ratelimited(c,
iter.pos.inode,
@@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
+ op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index e246b1e0..ee1c0325 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_subvol;
+ dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
index a9cc5cad..af3a2454 100644
--- a/libbcachefs/fs-io-pagecache.c
+++ b/libbcachefs/fs-io-pagecache.c
@@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
struct folio **fs, unsigned nr_folios)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_folio *s;
u64 offset = folio_sector(fs[0]);
- unsigned folio_idx;
- u32 snapshot;
bool need_set = false;
- int ret;
- for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
if (!s)
return -ENOMEM;
@@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
if (!need_set)
return 0;
- folio_idx = 0;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, ret) {
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
+ unsigned folio_idx = 0;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inum.inum, offset),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
- return ret;
+ if (folio_idx == nr_folios)
+ break;
+ 0;
+ })));
}
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 77b85da3..71d0fa38 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
- if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
- ret = 1;
- break;
- }
- start = iter.pos;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
+ subvol, 0, k, ({
+ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
+ })));
}
static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* folio
*/
- ret = range_has_data(c, inode->ei_subvol,
+ ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
@@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_subvol, &snapshot);
+ inode->ei_inum.subvol, &snapshot);
if (ret)
goto bkey_err;
@@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot;
u64 sectors = end - start;
- u64 pos = start;
- int ret;
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, pos, snapshot), 0);
-
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
- !(ret = bkey_err(k))) {
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
- bch2_btree_iter_advance(&iter);
- }
- pos = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+ inode->ei_inum.subvol, 0, k, ({
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+
+ 0;
+ })));
return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- POS(inode->v.i_ino, U64_MAX),
- 0, k, ret) {
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ 0;
+ })));
if (ret)
return ret;
@@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_slots, k, ret) {
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ 0;
+ })));
if (ret)
return ret;
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index aea8132d..f5dfb7e4 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
@@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index bdd9183b..c50457ba 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -152,42 +152,80 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static int bch2_iget5_test(struct inode *vinode, void *p)
+static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ return a.subvol == b.subvol && a.inum == b.inum;
+}
+
+static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct bch_inode_info *inode = obj;
+ const subvol_inum *v = arg->key;
- return inode->ei_subvol == inum->subvol &&
- inode->ei_inode.bi_inum == inum->inum;
+ return !subvol_inum_eq(inode->ei_inum, *v);
}
-static int bch2_iget5_set(struct inode *vinode, void *p)
+static const struct rhashtable_params bch2_vfs_inodes_params = {
+ .head_offset = offsetof(struct bch_inode_info, hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum),
+ .key_len = sizeof(subvol_inum),
+ .obj_cmpfn = bch2_vfs_inode_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static void __wait_on_freeing_inode(struct inode *inode)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ wq = bit_waitqueue(&inode->i_state, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+}
- inode->v.i_ino = inum->inum;
- inode->ei_subvol = inum->subvol;
- inode->ei_inode.bi_inum = inum->inum;
- return 0;
+static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode;
+repeat:
+ inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum,
+ bch2_vfs_inodes_params);
+ if (inode) {
+ spin_lock(&inode->v.i_lock);
+ if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+ __wait_on_freeing_inode(&inode->v);
+ goto repeat;
+ }
+ __iget(&inode->v);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ return inode;
}
-static unsigned bch2_inode_hash(subvol_inum inum)
+static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
- return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ if (test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ &inode->hash, bch2_vfs_inodes_params);
+ BUG_ON(ret);
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ inode->v.i_hash.pprev = NULL;
+ }
}
-static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
- subvol_inum inum = inode_inum(inode);
- struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
+ struct bch_inode_info *old = inode;
+retry:
+ if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+ &inode->hash,
+ bch2_vfs_inodes_params))) {
+ old = bch2_inode_hash_find(c, inode->ei_inum);
+ if (!old)
+ goto retry;
- if (unlikely(old != inode)) {
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But
@@ -203,16 +241,14 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
discard_new_inode(&inode->v);
inode = old;
} else {
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
+ inode_fake_hash(&inode->v);
+
+ inode_sb_list_add(&inode->v);
+
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
- /*
- * Again, I_NEW makes no sense for bcachefs. This is only needed
- * for clearing I_NEW, but since the inode was already fully
- * created and initialized we didn't actually want
- * inode_insert5() to set it for us.
- */
- unlock_new_inode(&inode->v);
}
return inode;
@@ -245,7 +281,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
kmem_cache_free(bch2_inode_cache, inode);
@@ -279,11 +314,7 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, inum);
if (inode)
return &inode->v;
@@ -297,7 +328,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (!ret) {
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
+ inode = bch2_inode_hash_insert(c, inode);
}
bch2_trans_put(trans);
@@ -345,7 +376,7 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
@@ -359,7 +390,7 @@ retry:
if (unlikely(ret))
goto err_before_quota;
- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
@@ -390,7 +421,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
- inode = bch2_inode_insert(c, inode);
+ inode = bch2_inode_hash_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -430,11 +461,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
goto err;
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, inum);
if (inode)
goto out;
@@ -464,7 +491,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
}
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
+ inode = bch2_inode_hash_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -551,8 +578,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
return bch2_err_class(ret);
@@ -608,7 +635,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret);
}
@@ -691,8 +718,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
trans = bch2_trans_get(c);
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
goto err;
@@ -893,7 +920,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
- stat->subvol = inode->ei_subvol;
+ stat->subvol = inode->ei_inum.subvol;
stat->result_mask |= STATX_SUBVOL;
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@@ -935,7 +962,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
@@ -1028,7 +1055,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bkey_buf cur, prev;
unsigned offset_into_extent, sectors;
bool have_extent = false;
- u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -1044,21 +1070,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
- if (ret)
- goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(ei->v.i_ino, start, snapshot), 0);
+ POS(ei->v.i_ino, start), 0);
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
- !(ret = bkey_err(k))) {
+ while (true) {
enum btree_id data_btree = BTREE_ID_extents;
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(&iter);
@@ -1102,16 +1137,12 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-
- ret = bch2_trans_relock(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
- start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
if (!ret && have_extent) {
bch2_trans_unlock(trans);
@@ -1167,7 +1198,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret)
return ret;
}
@@ -1201,7 +1232,7 @@ static const struct inode_operations bch_file_inode_operations = {
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1221,7 +1252,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1243,7 +1274,7 @@ static const struct inode_operations bch_symlink_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1253,7 +1284,7 @@ static const struct inode_operations bch_special_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1299,8 +1330,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
return (struct bcachefs_fid) {
- .inum = inode->ei_inode.bi_inum,
- .subvol = inode->ei_subvol,
+ .inum = inode->ei_inum.inum,
+ .subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation,
};
}
@@ -1385,7 +1416,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_subvol,
+ inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir,
};
@@ -1421,7 +1452,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret)
goto err;
@@ -1452,8 +1483,7 @@ retry:
if (ret)
goto err;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
} else {
/*
@@ -1474,8 +1504,7 @@ retry:
if (ret)
continue;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
}
}
@@ -1512,7 +1541,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
- bch2_iget5_set(&inode->v, &inum);
+ inode->v.i_ino = inum.inum;
+ inode->ei_inum = inum;
+ inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
@@ -1524,7 +1555,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
- inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@@ -1592,6 +1622,8 @@ static void bch2_evict_inode(struct inode *vinode)
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ bch2_inode_hash_remove(c, inode);
+
truncate_inode_pages_final(&inode->v.i_data);
clear_inode(&inode->v);
@@ -1633,7 +1665,7 @@ again:
mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_subvol))
+ if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue;
if (!(inode->v.i_state & I_DONTCACHE) &&
@@ -2121,6 +2153,17 @@ static int bch2_init_fs_context(struct fs_context *fc)
return 0;
}
+void bch2_fs_vfs_exit(struct bch_fs *c)
+{
+ if (c->vfs_inodes_table.tbl)
+ rhashtable_destroy(&c->vfs_inodes_table);
+}
+
+int bch2_fs_vfs_init(struct bch_fs *c)
+{
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
+}
+
static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE,
.name = "bcachefs",
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index c3af7225..f8f88781 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -13,6 +13,9 @@
struct bch_inode_info {
struct inode v;
+ struct rhash_head hash;
+ subvol_inum ei_inum;
+
struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
@@ -24,8 +27,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- u32 ei_subvol;
-
/*
* When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices
@@ -50,10 +51,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
- return (subvol_inum) {
- .subvol = inode->ei_subvol,
- .inum = inode->ei_inode.bi_inum,
- };
+ return inode->ei_inum;
}
/*
@@ -67,6 +65,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
* those:
*/
#define EI_INODE_SNAPSHOT 1
+#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@@ -187,6 +186,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+void bch2_fs_vfs_exit(struct bch_fs *);
+int bch2_fs_vfs_init(struct bch_fs *);
+
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -196,6 +198,10 @@ int bch2_vfs_init(void);
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
+
+static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
+static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
+
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index cc4f0963..9138944c 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -283,6 +283,7 @@ static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
+ struct bch_fs *c = trans->c;
struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
@@ -317,7 +318,7 @@ static int reattach_inode(struct btree_trans *trans,
return ret;
}
- dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+ dir_hash = bch2_hash_info_init(c, &lostfound);
name = (struct qstr) QSTR(name_buf);
@@ -330,8 +331,10 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_subvol ?: inode->bi_inum,
&dir_offset,
STR_HASH_must_create);
- if (ret)
+ if (ret) {
+ bch_err_msg(c, ret, "error creating dirent");
return ret;
+ }
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
index 2cf62977..177ed331 100644
--- a/libbcachefs/io_misc.c
+++ b/libbcachefs/io_misc.c
@@ -126,11 +126,7 @@ err_noprint:
if (closure_nr_remaining(&cl) != 1) {
bch2_trans_unlock_long(trans);
-
- if (closure_sync_timeout(&cl, HZ * 10)) {
- bch2_print_allocator_stuck(c);
- closure_sync(&cl);
- }
+ bch2_wait_on_allocator(c, &cl);
}
return ret;
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 4531c9ab..ce27ba1f 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
*/
bool promote_full = (failed ||
*read_full ||
- READ_ONCE(c->promote_whole_extents));
+ READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
rbio->read_pos, BTREE_ITER_slots);
retry:
+ bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
k = bch2_btree_iter_peek_slot(&iter);
@@ -1213,10 +1214,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
- goto err;
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index d31c8d00..1d4761d1 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -1503,10 +1503,7 @@ err:
if ((op->flags & BCH_WRITE_SYNC) ||
(!(op->flags & BCH_WRITE_SUBMITTED) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
- if (closure_sync_timeout(&op->cl, HZ * 10)) {
- bch2_print_allocator_stuck(c);
- closure_sync(&op->cl);
- }
+ bch2_wait_on_allocator(c, &op->cl);
__bch2_write_index(op);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index d8a63074..70b998d9 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -206,6 +206,7 @@ void bch2_journal_space_available(struct journal *j)
if (nr_online < metadata_replicas_required(c)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
"rw journal devs:", nr_online, metadata_replicas_required(c));
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 1d4ffd01..3d83bcdc 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -265,6 +265,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
+ x(promote_whole_extents, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
+ NULL, "Promote whole extents, instead of just part being read")\
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
@@ -393,6 +398,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
NULL, "Log transaction function names in journal") \
+ x(allocator_stuck_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
+ NULL, "Default timeout in seconds for stuck allocator messages")\
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
index 39196f2a..b4ea6490 100644
--- a/libbcachefs/sb-members.c
+++ b/libbcachefs/sb-members.c
@@ -464,3 +464,12 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
}
}
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index dd93192e..f307f285 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -307,6 +307,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
return false;
}
+unsigned bch2_sb_nr_devices(const struct bch_sb *);
+
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index afa5e871..03b8c6fc 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -32,6 +32,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
+static inline struct bkey_s_c
+bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
+ u32 subvolid, unsigned flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+ return bch2_btree_iter_peek_upto_type(iter, end, flags);
+}
+
+#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \
+ _end, _subvolid, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \
+ _start, _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do); \
+})
+
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 8bc81983..d86d5dae 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -414,6 +414,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
+ !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
+ SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
+ SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
@@ -1288,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- u64 fields_have = 0;
- unsigned nr_devices = 0;
-
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
- for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_member_exists(sb, i);
-
prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
@@ -1352,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_newline(out);
prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", nr_devices);
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
prt_printf(out, "Sections:\t");
+ u64 fields_have = 0;
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
prt_bitflags(out, bch2_sb_fields, fields_have);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 0455a100..d8adf465 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c);
@@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
- c->promote_whole_extents = true;
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
@@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
+ bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c);
@@ -1193,7 +1194,6 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
- kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d4c5bfca..9f8ca6a5 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -219,7 +219,6 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
-rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -347,8 +346,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
- sysfs_print(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (attr == &sysfs_journal_debug)
@@ -367,7 +364,7 @@ SHOW(bch2_fs)
bch2_stripes_heap_to_text(out, c);
if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c);
+ bch2_open_buckets_to_text(out, c, NULL);
if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);
@@ -436,8 +433,6 @@ STORE(bch2_fs)
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
- sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (!test_bit(BCH_FS_started, &c->flags))
@@ -514,7 +509,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_cache_size,
&sysfs_btree_write_stats,
- &sysfs_promote_whole_extents,
+ &sysfs_rebalance_status,
&sysfs_compression_stats,
@@ -614,7 +609,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -811,6 +805,9 @@ SHOW(bch2_dev)
if (attr == &sysfs_alloc_debug)
bch2_dev_alloc_debug_to_text(out, ca);
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c, ca);
+
return 0;
}
@@ -877,6 +874,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
+ &sysfs_open_buckets,
NULL
};
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 89fdaf2b..332bc17f 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * random utiility code, for bcache but in theory not specific to bcache
+ * random utility code, for bcache but in theory not specific to bcache
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index bd9db6bf..fb02c1c3 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -8,6 +8,7 @@
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
+#include <linux/min_heap.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
@@ -54,17 +55,9 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-#define HEAP(type) \
-struct { \
- size_t size, used; \
- type *data; \
-}
-
-#define DECLARE_HEAP(type, name) HEAP(type) name
-
#define init_heap(heap, _size, gfp) \
({ \
- (heap)->used = 0; \
+ (heap)->nr = 0; \
(heap)->size = (_size); \
(heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
@@ -76,113 +69,6 @@ do { \
(heap)->data = NULL; \
} while (0)
-#define heap_set_backpointer(h, i, _fn) \
-do { \
- void (*fn)(typeof(h), size_t) = _fn; \
- if (fn) \
- fn(h, i); \
-} while (0)
-
-#define heap_swap(h, i, j, set_backpointer) \
-do { \
- swap((h)->data[i], (h)->data[j]); \
- heap_set_backpointer(h, i, set_backpointer); \
- heap_set_backpointer(h, j, set_backpointer); \
-} while (0)
-
-#define heap_peek(h) \
-({ \
- EBUG_ON(!(h)->used); \
- (h)->data[0]; \
-})
-
-#define heap_full(h) ((h)->used == (h)->size)
-
-#define heap_sift_down(h, i, cmp, set_backpointer) \
-do { \
- size_t _c, _j = i; \
- \
- for (; _j * 2 + 1 < (h)->used; _j = _c) { \
- _c = _j * 2 + 1; \
- if (_c + 1 < (h)->used && \
- cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
- _c++; \
- \
- if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
- break; \
- heap_swap(h, _c, _j, set_backpointer); \
- } \
-} while (0)
-
-#define heap_sift_up(h, i, cmp, set_backpointer) \
-do { \
- while (i) { \
- size_t p = (i - 1) / 2; \
- if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
- break; \
- heap_swap(h, i, p, set_backpointer); \
- i = p; \
- } \
-} while (0)
-
-#define __heap_add(h, d, cmp, set_backpointer) \
-({ \
- size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
- heap_set_backpointer(h, _i, set_backpointer); \
- \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- _i; \
-})
-
-#define heap_add(h, d, cmp, set_backpointer) \
-({ \
- bool _r = !heap_full(h); \
- if (_r) \
- __heap_add(h, d, cmp, set_backpointer); \
- _r; \
-})
-
-#define heap_add_or_replace(h, new, cmp, set_backpointer) \
-do { \
- if (!heap_add(h, new, cmp, set_backpointer) && \
- cmp(h, new, heap_peek(h)) >= 0) { \
- (h)->data[0] = new; \
- heap_set_backpointer(h, 0, set_backpointer); \
- heap_sift_down(h, 0, cmp, set_backpointer); \
- } \
-} while (0)
-
-#define heap_del(h, i, cmp, set_backpointer) \
-do { \
- size_t _i = (i); \
- \
- BUG_ON(_i >= (h)->used); \
- (h)->used--; \
- if ((_i) < (h)->used) { \
- heap_swap(h, _i, (h)->used, set_backpointer); \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- heap_sift_down(h, _i, cmp, set_backpointer); \
- } \
-} while (0)
-
-#define heap_pop(h, d, cmp, set_backpointer) \
-({ \
- bool _r = (h)->used; \
- if (_r) { \
- (d) = (h)->data[0]; \
- heap_del(h, 0, cmp, set_backpointer); \
- } \
- _r; \
-})
-
-#define heap_resort(heap, cmp, set_backpointer) \
-do { \
- ssize_t _i; \
- for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
- heap_sift_down(heap, _i, cmp, set_backpointer); \
-} while (0)
-
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index c11bf6da..780781f5 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -296,54 +296,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
- u32 snapshot;
- int ret;
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot),
- POS(inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_xattr)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
- ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- if (ret)
- break;
- }
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ }))) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
-
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- if (ret)
- goto out;
-
- return buf.used;
-out:
- return bch2_err_class(ret);
+ return ret ? bch2_err_class(ret) : buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,