summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.bcachefs_revision2
-rw-r--r--Makefile52
-rw-r--r--cmd_fs.c14
-rw-r--r--cmd_fsck.c71
-rw-r--r--cmd_fusemount.c68
-rw-r--r--debian/bcachefs-tools.postinst2
-rw-r--r--debian/bcachefs-tools.postrm2
-rw-r--r--debian/control4
-rwxr-xr-xdebian/rules2
-rw-r--r--fsck/.gitignore1
-rw-r--r--fsck/bcachefsck@.service.in98
-rw-r--r--fsck/bcachefsck_all.in481
-rw-r--r--fsck/bcachefsck_all.service.in84
-rw-r--r--fsck/bcachefsck_all.timer16
-rw-r--r--fsck/bcachefsck_all_fail.service.in71
-rwxr-xr-xfsck/bcachefsck_fail63
-rw-r--r--fsck/bcachefsck_fail@.service.in75
-rw-r--r--fsck/system-bcachefsck.slice30
-rw-r--r--libbcachefs/bcachefs.h53
-rw-r--r--libbcachefs/bcachefs_ioctl.h24
-rw-r--r--libbcachefs/btree_cache.c30
-rw-r--r--libbcachefs/btree_cache.h4
-rw-r--r--libbcachefs/btree_io.c15
-rw-r--r--libbcachefs/btree_io.h5
-rw-r--r--libbcachefs/btree_iter.c27
-rw-r--r--libbcachefs/btree_iter.h23
-rw-r--r--libbcachefs/btree_key_cache.c2
-rw-r--r--libbcachefs/btree_locking.h2
-rw-r--r--libbcachefs/btree_types.h2
-rw-r--r--libbcachefs/btree_update.c16
-rw-r--r--libbcachefs/btree_update_interior.c46
-rw-r--r--libbcachefs/buckets.c200
-rw-r--r--libbcachefs/buckets.h4
-rw-r--r--libbcachefs/chardev.c342
-rw-r--r--libbcachefs/compress.c4
-rw-r--r--libbcachefs/data_update.c4
-rw-r--r--libbcachefs/errcode.h1
-rw-r--r--libbcachefs/fs-ioctl.c16
-rw-r--r--libbcachefs/opts.c4
-rw-r--r--libbcachefs/opts.h5
-rw-r--r--libbcachefs/recovery.c16
-rw-r--r--libbcachefs/reflink.c209
-rw-r--r--libbcachefs/reflink.h4
-rw-r--r--libbcachefs/super.c21
-rw-r--r--libbcachefs/sysfs.c8
-rw-r--r--libbcachefs/trace.h104
46 files changed, 1884 insertions, 443 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index b0f07a3d..393a80ed 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-71a5b27e017df6ebae391da58857b22fdc406276
+6d44812757ddf81fad087d6abe662355e6712e02
diff --git a/Makefile b/Makefile
index 42692673..a7926f46 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
-D_LGPL_SOURCE \
-DRCU_MEMBARRIER \
-DZSTD_STATIC_LINKING_ONLY \
- -DFUSE_USE_VERSION=32 \
+ -DFUSE_USE_VERSION=35 \
-DNO_BCACHEFS_CHARDEV \
-DNO_BCACHEFS_FS \
-DNO_BCACHEFS_SYSFS \
@@ -91,9 +91,47 @@ else
ROOT_SBINDIR?=$(PREFIX)/sbin
INITRAMFS_DIR=/etc/initramfs-tools
endif
+LIBDIR=$(PREFIX)/lib
+
+# Ask pkg-config for the systemd unit directory.  stderr is discarded so that
+# hosts without pkg-config or systemd.pc only see the warning below; an empty
+# result disables the systemd integration entirely.
+PKGCONFIG_SERVICEDIR:=$(shell $(PKG_CONFIG) --variable=systemdsystemunitdir systemd 2>/dev/null)
+ifeq (,$(strip $(PKGCONFIG_SERVICEDIR)))
+  $(warning skipping systemd integration)
+else
+BCACHEFSCK_ARGS=-f -n
+
+# Helper programs installed into $(LIBDIR).
+systemd_libfiles=\
+	fsck/bcachefsck_fail \
+	fsck/bcachefsck_all
+
+# Unit files installed into $(PKGCONFIG_SERVICEDIR).
+systemd_services=\
+	fsck/bcachefsck_fail@.service \
+	fsck/bcachefsck@.service \
+	fsck/system-bcachefsck.slice \
+	fsck/bcachefsck_all_fail.service \
+	fsck/bcachefsck_all.service \
+	fsck/bcachefsck_all.timer
+
+# Files generated from *.in templates; removed again by "make clean".
+built_scripts+=\
+	fsck/bcachefsck_fail@.service \
+	fsck/bcachefsck@.service \
+	fsck/bcachefsck_all_fail.service \
+	fsck/bcachefsck_all \
+	fsck/bcachefsck_all.service
+
+# Generated files are written to a temporary name and renamed into place so
+# that a failed sed never leaves a truncated target that looks up to date.
+%.service: %.service.in
+	@echo " [SED] $@"
+	$(Q)sed -e "s|@libdir@|$(LIBDIR)|g" \
+		-e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@.tmp \
+		&& mv -f $@.tmp $@ || { $(RM) $@.tmp; exit 1; }
+
+fsck/bcachefsck_all: fsck/bcachefsck_all.in
+	@echo " [SED] $@"
+	$(Q)sed -e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@.tmp \
+		&& mv -f $@.tmp $@ || { $(RM) $@.tmp; exit 1; }
+
+optional_build+=$(systemd_libfiles) $(systemd_services)
+optional_install+=install_systemd
+endif # PKGCONFIG_SERVICEDIR
.PHONY: all
-all: bcachefs
+all: bcachefs $(optional_build)
.PHONY: debug
debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
@@ -157,7 +195,7 @@ cmd_version.o : .version
.PHONY: install
install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
-install: bcachefs
+install: bcachefs $(optional_install)
$(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/
$(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
@@ -173,11 +211,17 @@ install: bcachefs
sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
+.PHONY: install_systemd
+install_systemd: $(systemd_services) $(systemd_libfiles)
+ $(INSTALL) -m0755 -D $(systemd_libfiles) -t $(DESTDIR)$(LIBDIR)
+ $(INSTALL) -m0644 -D $(systemd_services) -t $(DESTDIR)$(PKGCONFIG_SERVICEDIR)
+
.PHONY: clean
clean:
@echo "Cleaning all"
$(Q)$(RM) bcachefs libbcachefs.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED)
$(Q)$(RM) -rf rust-src/*/target
+ $(Q)$(RM) -f $(built_scripts)
.PHONY: deb
deb: all
@@ -224,7 +268,7 @@ update-bcachefs-sources:
git add include/linux/kmemleak.h
cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
git add linux/int_sqrt.c
- rm libbcachefs/mean_and_variance_test.c
+ git rm libbcachefs/mean_and_variance_test.c
# cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
# git add linux/mean_and_variance.c
# cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
diff --git a/cmd_fs.c b/cmd_fs.c
index a828f39b..b5c8ab3a 100644
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -39,10 +39,22 @@ static void dev_usage_type_to_text(struct printbuf *out,
struct bch_ioctl_dev_usage_v2 *u,
enum bch_data_type type)
{
+ u64 sectors = 0;
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_discard:
+ case BCH_DATA_need_gc_gens:
+ /* sectors are 0 for these types so calculate sectors for them */
+ sectors = u->d[type].buckets * u->bucket_size;
+ break;
+ default:
+ sectors = u->d[type].sectors;
+ }
+
__dev_usage_type_to_text(out, bch2_data_types[type],
u->bucket_size,
u->d[type].buckets,
- u->d[type].sectors,
+ sectors,
u->d[type].fragmented);
}
diff --git a/cmd_fsck.c b/cmd_fsck.c
index f7dcae98..0c9663d3 100644
--- a/cmd_fsck.c
+++ b/cmd_fsck.c
@@ -1,5 +1,7 @@
#include <getopt.h>
+#include <sys/uio.h>
+#include <unistd.h>
#include "cmds.h"
#include "libbcachefs/error.h"
#include "libbcachefs.h"
@@ -23,6 +25,62 @@ static void usage(void)
"Report bugs to <linux-bcachefs@vger.kernel.org>");
}
+/*
+ * Put @fd into non-blocking mode; dies on any fcntl() failure.
+ */
+static void setnonblocking(int fd)
+{
+	int flags = fcntl(fd, F_GETFL);
+
+	/* F_GETFL returns -1 on error; OR-ing that into F_SETFL would set every flag bit */
+	if (flags < 0)
+		die("fcntl error: %m");
+	if (fcntl(fd, F_SETFL, flags|O_NONBLOCK))
+		die("fcntl error: %m");
+}
+
+/*
+ * Copy one buffer's worth of data from @rfd to @wfd.
+ *
+ * Returns 0 if data was copied or nothing was available yet, 1 on EOF of
+ * @rfd, and a negative value on a read error.  Dies on a write error.
+ */
+static int do_splice(int rfd, int wfd)
+{
+	char buf[4096];
+	char *p = buf;
+
+	ssize_t r = read(rfd, buf, sizeof(buf));
+	/* nothing to read right now, or interrupted: let the caller retry */
+	if (r < 0 && (errno == EAGAIN || errno == EINTR))
+		return 0;
+	if (r < 0)
+		return r;
+	if (!r)
+		return 1;
+
+	/*
+	 * The destination may be non-blocking too, so a write can be short or
+	 * fail with EAGAIN; keep going until the whole buffer is out instead
+	 * of dropping data or dying.
+	 */
+	while (r) {
+		ssize_t w = write(wfd, p, r);
+
+		if (w < 0 && (errno == EAGAIN || errno == EINTR)) {
+			fd_set wfds;
+
+			/* wait until the destination can take more data */
+			FD_ZERO(&wfds);
+			FD_SET(wfd, &wfds);
+			select(wfd + 1, NULL, &wfds, NULL, NULL);
+			continue;
+		}
+		if (w < 0)
+			die("write error: %m");
+
+		p += w;
+		r -= w;
+	}
+	return 0;
+}
+
+/*
+ * Run fsck on the mounted filesystem that @dev_path belongs to.
+ *
+ * Issues BCH_IOCTL_FSCK_ONLINE and then shuttles data between the returned
+ * fd and our stdin/stdout until one side reports EOF or a read error.
+ * Returns 0 on EOF, or the negative value from do_splice() on a read error.
+ */
+static int fsck_online(const char *dev_path)
+{
+	int dev_idx;
+	struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
+
+	struct bch_ioctl_fsck_online fsck = { 0 };
+
+	int fsck_fd = ioctl(fs.ioctl_fd, BCH_IOCTL_FSCK_ONLINE, &fsck);
+	/* ioctl() returns -1 on failure; the actual reason is in errno */
+	if (fsck_fd < 0)
+		die("BCH_IOCTL_FSCK_ONLINE error: %m");
+
+	setnonblocking(STDIN_FILENO);
+	setnonblocking(fsck_fd);
+
+	/* The loop only exits through the return below. */
+	while (true) {
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		FD_SET(STDIN_FILENO, &fds);
+		FD_SET(fsck_fd, &fds);
+
+		/*
+		 * Both fds are non-blocking, so after an EINTR it is fine to
+		 * fall through and just try the reads.
+		 */
+		if (select(fsck_fd + 1, &fds, NULL, NULL, NULL) < 0 &&
+		    errno != EINTR)
+			die("select error: %m");
+
+		int r = do_splice(fsck_fd, STDOUT_FILENO) ?:
+			do_splice(STDIN_FILENO, fsck_fd);
+		if (r)
+			return r < 0 ? r : 0;
+	}
+}
+
int cmd_fsck(int argc, char *argv[])
{
static const struct option longopts[] = {
@@ -80,16 +138,9 @@ int cmd_fsck(int argc, char *argv[])
exit(8);
}
- for (i = 0; i < argc; i++) {
- switch (dev_mounted(argv[i])) {
- case 1:
- ret |= 2;
- break;
- case 2:
- fprintf(stderr, "%s is mounted read-write - aborting\n", argv[i]);
- exit(8);
- }
- }
+ for (i = 0; i < argc; i++)
+ if (dev_mounted(argv[i]))
+ return fsck_online(argv[i]);
struct bch_fs *c = bch2_fs_open(argv, argc, opts);
if (IS_ERR(c)) {
diff --git a/cmd_fusemount.c b/cmd_fusemount.c
index a09d296c..d81f3188 100644
--- a/cmd_fusemount.c
+++ b/cmd_fusemount.c
@@ -34,6 +34,15 @@
/* XXX cut and pasted from fsck.c */
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+/* used by write_aligned function for waiting on bch2_write closure */
+struct write_aligned_op_t {
+ struct closure cl;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+
static inline subvol_inum map_root_ino(u64 ino)
{
return (subvol_inum) { 1, ino == 1 ? 4096 : ino };
@@ -343,7 +352,7 @@ static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino,
int ret;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n",
- inum, newparent.inum, newname);
+ inum.inum, newparent.inum, newname);
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_link_trans(trans, newparent, &dir_u,
@@ -392,6 +401,14 @@ static void bcachefs_fuse_read_endio(struct bio *bio)
closure_put(bio->bi_private);
}
+
+/*
+ * Write completion callback (installed as op->end_io by write_aligned()):
+ * recover the write_aligned_op_t that embeds @op and drop a reference on its
+ * closure, which write_aligned() waits on with closure_sync().
+ */
+static void bcachefs_fuse_write_endio(struct bch_write_op *op)
+{
+ struct write_aligned_op_t *w = container_of(op,struct write_aligned_op_t,op);
+ closure_put(&w->cl);
+}
+
+
struct fuse_align_io {
off_t start;
size_t pad_start;
@@ -554,41 +571,47 @@ static int write_aligned(struct bch_fs *c, subvol_inum inum,
size_t aligned_size, off_t aligned_offset,
off_t new_i_size, size_t *written_out)
{
- struct bch_write_op op = { 0 };
+
+ struct write_aligned_op_t w = { 0 }
+;
+ struct bch_write_op *op = &w.op;
struct bio_vec bv;
- struct closure cl;
BUG_ON(aligned_size & (block_bytes(c) - 1));
BUG_ON(aligned_offset & (block_bytes(c) - 1));
*written_out = 0;
- closure_init_stack(&cl);
+ closure_init_stack(&w.cl);
- bch2_write_op_init(&op, c, io_opts); /* XXX reads from op?! */
- op.write_point = writepoint_hashed(0);
- op.nr_replicas = io_opts.data_replicas;
- op.target = io_opts.foreground_target;
- op.subvol = inum.subvol;
- op.pos = POS(inum.inum, aligned_offset >> 9);
- op.new_i_size = new_i_size;
+ bch2_write_op_init(op, c, io_opts); /* XXX reads from op?! */
+ op->write_point = writepoint_hashed(0);
+ op->nr_replicas = io_opts.data_replicas;
+ op->target = io_opts.foreground_target;
+ op->subvol = inum.subvol;
+ op->pos = POS(inum.inum, aligned_offset >> 9);
+ op->new_i_size = new_i_size;
+ op->end_io = bcachefs_fuse_write_endio;
- userbio_init(&op.wbio.bio, &bv, buf, aligned_size);
- bio_set_op_attrs(&op.wbio.bio, REQ_OP_WRITE, REQ_SYNC);
+ userbio_init(&op->wbio.bio, &bv, buf, aligned_size);
+ bio_set_op_attrs(&op->wbio.bio, REQ_OP_WRITE, REQ_SYNC);
- if (bch2_disk_reservation_get(c, &op.res, aligned_size >> 9,
- op.nr_replicas, 0)) {
+ if (bch2_disk_reservation_get(c, &op->res, aligned_size >> 9,
+ op->nr_replicas, 0)) {
/* XXX: use check_range_allocated like dio write path */
return -ENOSPC;
}
- closure_call(&op.cl, bch2_write, NULL, &cl);
- closure_sync(&cl);
+ closure_get(&w.cl);
- if (!op.error)
- *written_out = op.written << 9;
+ closure_call(&op->cl, bch2_write, NULL, NULL);
- return op.error;
+ closure_sync(&w.cl);
+
+ if (!op->error)
+ *written_out = op->written << 9;
+
+ return op->error;
}
static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino,
@@ -1255,6 +1278,11 @@ int cmd_fusemount(int argc, char *argv[])
/* This print statement is a trigger for tests. */
printf("Fuse mount initialized.\n");
+ if (fuse_opts.foreground == 0){
+ printf("Fuse forcing to foreground mode, due gcc constructors usage.\n");
+ fuse_opts.foreground = 1;
+ }
+
fuse_daemonize(fuse_opts.foreground);
ret = fuse_session_loop(se);
diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst
index 483b9619..56dd8905 100644
--- a/debian/bcachefs-tools.postinst
+++ b/debian/bcachefs-tools.postinst
@@ -2,6 +2,8 @@
set -e
+#DEBHELPER#
+
case "$1" in
configure)
if which update-initramfs >/dev/null; then
diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm
index 6b6fe8ac..2d913367 100644
--- a/debian/bcachefs-tools.postrm
+++ b/debian/bcachefs-tools.postrm
@@ -2,6 +2,8 @@
set -e
+#DEBHELPER#
+
case "$1" in
remove)
if which update-initramfs >/dev/null; then
diff --git a/debian/control b/debian/control
index 2e7c86b2..e68bd7ab 100644
--- a/debian/control
+++ b/debian/control
@@ -6,12 +6,12 @@ Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), pkg-config, libaio-dev, libblkid-dev,
libkeyutils-dev, liblz4-dev, libsodium-dev, liburcu-dev, libudev-dev,
libzstd-dev, uuid-dev, zlib1g-dev, python3, python3-docutils,
- rustc, cargo, llvm, clang, libclang-dev
+ rustc, cargo, llvm, clang, libclang-dev, systemd
Homepage: https://bcachefs.org/
Package: bcachefs-tools
Architecture: linux-any
-Depends: ${shlibs:Depends}, ${misc:Depends}
+Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}
Recommends: initramfs-tools | linux-initramfs-tool
Description: bcachefs userspace tools
Userspace tools for bcachefs, a modern copy on write, checksumming, multi
diff --git a/debian/rules b/debian/rules
index ae98f5ce..7a713efb 100755
--- a/debian/rules
+++ b/debian/rules
@@ -3,7 +3,7 @@
PREFIX := /usr
%:
- dh $@
+ dh --with python3 $@
override_dh_auto_install:
dh_auto_install -- "PREFIX=$(PREFIX)"
diff --git a/fsck/.gitignore b/fsck/.gitignore
new file mode 100644
index 00000000..0e3ad1b0
--- /dev/null
+++ b/fsck/.gitignore
@@ -0,0 +1 @@
+*.service
diff --git a/fsck/bcachefsck@.service.in b/fsck/bcachefsck@.service.in
new file mode 100644
index 00000000..86c1824c
--- /dev/null
+++ b/fsck/bcachefsck@.service.in
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for %f
+# Hand failures to the reporting unit; pointing this at bcachefsck@ itself
+# would just re-trigger the check that failed.
+OnFailure=bcachefsck_fail@%i.service
+Documentation=man:bcachefs(8)
+
+# Explicitly require the capabilities that this program needs
+ConditionCapability=CAP_SYS_ADMIN
+ConditionCapability=CAP_FOWNER
+ConditionCapability=CAP_DAC_OVERRIDE
+ConditionCapability=CAP_DAC_READ_SEARCH
+ConditionCapability=CAP_SYS_RAWIO
+
+# Must be a mountpoint
+ConditionPathIsMountPoint=%f
+RequiresMountsFor=%f
+
+[Service]
+Type=oneshot
+Environment=SERVICE_MODE=1
+ExecStart=bcachefs fsck --real-mountpoint /tmp/scrub/ @bcachefsck_args@ %f
+SyslogIdentifier=%N
+
+# Run scrub with minimal CPU and IO priority so that nothing else will starve.
+IOSchedulingClass=idle
+CPUSchedulingPolicy=idle
+CPUAccounting=true
+Nice=19
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Dynamically create a user that isn't root
+DynamicUser=true
+
+# Make the entire filesystem readonly and /home inaccessible, then bind mount
+# the filesystem we're supposed to be checking into our private /tmp dir.
+# 'norbind' means that we don't bind anything under that original mount.
+# This enables checking filesystems mounted under /tmp in the global mount
+# namespace.
+ProtectSystem=strict
+ProtectHome=yes
+PrivateTmp=true
+BindPaths=%f:/tmp/scrub:norbind
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# bcachefsck needs these privileges to run, and no others
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+AmbientCapabilities=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+NoNewPrivileges=true
+
+# bcachefsck doesn't create files
+UMask=7777
+
+# No access to hardware /dev files except for block devices
+ProtectClock=true
+DevicePolicy=closed
+DeviceAllow=block-*
diff --git a/fsck/bcachefsck_all.in b/fsck/bcachefsck_all.in
new file mode 100644
index 00000000..4f6031eb
--- /dev/null
+++ b/fsck/bcachefsck_all.in
@@ -0,0 +1,481 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2023-2024 Oracle. All rights reserved.
+#
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Run bcachefsck in parallel, but avoid thrashing.
+
+import subprocess
+import json
+import threading
+import time
+import sys
+import os
+import argparse
+import signal
+import dbus
+from io import TextIOWrapper
+from pathlib import Path
+from datetime import timedelta
+from datetime import datetime
+from datetime import timezone
+
+retcode = 0
+terminate = False
+debug = False
+
+def DEVNULL():
+ '''Return /dev/null in subprocess writable format.'''
+ try:
+ from subprocess import DEVNULL
+ return DEVNULL
+ except ImportError:
+ return open(os.devnull, 'wb')
+
+def find_mounts():
+	'''Map mountpoints to physical disks.
+
+	Runs lsblk and returns a dict that maps each mounted bcachefs
+	mountpoint to the set of top-level kernel device names it sits on.
+	Returns an empty dict if lsblk fails.'''
+	def find_bcachefs_mounts(bdev, fs, lastdisk):
+		'''Attach all lastdisk to each fs found under bdev.'''
+		if bdev['fstype'] == 'bcachefs' and bdev['mountpoint'] is not None:
+			mnt = bdev['mountpoint']
+			# update() merges the individual names; add() would try
+			# to hash the whole list and raise TypeError.
+			fs.setdefault(mnt, set()).update(lastdisk.split(':'))
+		for child in bdev.get('children') or []:
+			find_bcachefs_mounts(child, fs, lastdisk)
+
+	fs = {}
+	cmd=['lsblk', '-o', 'NAME,KNAME,TYPE,FSTYPE,MOUNTPOINT', '-J']
+	result = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+	# communicate() drains the pipe while waiting, so a long device
+	# listing cannot fill the pipe buffer and stall lsblk forever.
+	stdout, _ = result.communicate()
+	if result.returncode != 0:
+		return fs
+	bdevdata = json.loads(stdout.decode(sys.stdout.encoding))
+
+	# The lsblk output had better be in disks-then-partitions order
+	for bdev in bdevdata['blockdevices']:
+		lastdisk = bdev['kname']
+		find_bcachefs_mounts(bdev, fs, lastdisk)
+
+	return fs
+
+def backtick(cmd):
+ '''Generator function that yields lines of a program's stdout.'''
+ p = subprocess.Popen(cmd, stdout = subprocess.PIPE)
+ for line in TextIOWrapper(p.stdout, encoding="utf-8"):
+ yield line.strip()
+
+def remove_killfunc(killfuncs, fn):
+	'''Ensure fn is not in the killfuncs set; a no-op if it is already
+	absent.'''
+	# discard() ignores a missing member without a catch-all except
+	# clause that would also hide unrelated errors.
+	killfuncs.discard(fn)
+
+class scrub_control(object):
+ '''Control object for bcachefsck.'''
+ def __init__(self):
+ pass
+
+ def start(self):
+ '''Start scrub and wait for it to complete. Returns -1 if the
+ service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ assert False
+
+ def stop(self):
+ '''Stop scrub.'''
+ assert False
+
+class scrub_subprocess(scrub_control):
+ '''Control object for bcachefsck subprocesses.'''
+ def __init__(self, mnt):
+ cmd = ['bcachefs', 'fsck']
+ cmd += '@bcachefsck_args@'.split()
+ cmd += [mnt]
+ self.cmdline = cmd
+ self.proc = None
+
+ def start(self):
+ '''Start bcachefsck and wait for it to complete. Returns -1 if
+ the service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ global debug
+
+ if debug:
+ print('run ', ' '.join(self.cmdline))
+
+ try:
+ self.proc = subprocess.Popen(self.cmdline)
+ self.proc.wait()
+ except:
+ return -1
+
+ proc = self.proc
+ self.proc = None
+ return proc.returncode
+
+ def stop(self):
+ '''Stop bcachefsck.'''
+ global debug
+
+ if debug:
+ print('kill ', ' '.join(self.cmdline))
+ if self.proc is not None:
+ self.proc.terminate()
+
+def run_subprocess(mnt, killfuncs):
+ '''Run a killable program. Returns program retcode or -1 if we can't
+ start it.'''
+ try:
+ p = scrub_subprocess(mnt)
+ killfuncs.add(p.stop)
+ ret = p.start()
+ remove_killfunc(killfuncs, p.stop)
+ return ret
+ except:
+ return -1
+
+# systemd doesn't like unit instance names with slashes in them, so it
+# replaces them with dashes when it invokes the service. Filesystem paths
+# need a special --path argument so that dashes do not get mangled.
+def path_to_serviceunit(path):
+	'''Translate a filesystem path into the matching bcachefsck@ unit
+	name by asking systemd-escape.  Returns None if the tool printed
+	nothing.'''
+
+	template = 'bcachefsck@.service'
+	argv = ['systemd-escape', '--template', template, '--path', path]
+
+	escaper = subprocess.Popen(argv, stdout = subprocess.PIPE)
+	out, _ = escaper.communicate()
+	# Only the first line of output is the unit name.
+	for raw in out.splitlines():
+		return raw.decode(sys.stdout.encoding).strip()
+
+def fibonacci(max_ret):
+	'''Yield the Fibonacci sequence 1, 1, 2, 3, 5, ... for as long as
+	the terms do not exceed max_ret; max_ret itself is yielded when it
+	is a Fibonacci number.  Yields nothing if max_ret < 1.'''
+	if max_ret < 1:
+		return
+
+	prev, cur = 1, 1
+	yield prev
+
+	while cur <= max_ret:
+		yield cur
+		prev, cur = cur, prev + cur
+
+class scrub_service(scrub_control):
+ '''Control object for bcachefsck systemd service.'''
+ def __init__(self, mnt):
+ self.unitname = path_to_serviceunit(mnt)
+ self.prop = None
+ self.unit = None
+ self.bind()
+
+ def bind(self):
+ '''Bind to the dbus proxy object for this service.'''
+ sysbus = dbus.SystemBus()
+ systemd1 = sysbus.get_object('org.freedesktop.systemd1',
+ '/org/freedesktop/systemd1')
+ manager = dbus.Interface(systemd1,
+ 'org.freedesktop.systemd1.Manager')
+ path = manager.LoadUnit(self.unitname)
+
+ svc_obj = sysbus.get_object('org.freedesktop.systemd1', path)
+ self.prop = dbus.Interface(svc_obj,
+ 'org.freedesktop.DBus.Properties')
+ self.unit = dbus.Interface(svc_obj,
+ 'org.freedesktop.systemd1.Unit')
+
+ def __dbusrun(self, lambda_fn):
+ '''Call the lambda function to execute something on dbus. dbus
+ exceptions result in retries with Fibonacci backoff, and the
+ bindings will be rebuilt every time.'''
+ global debug
+
+ fatal_ex = None
+
+ for i in fibonacci(30):
+ try:
+ return lambda_fn()
+ except dbus.exceptions.DBusException as e:
+ if debug:
+ print(e)
+ fatal_ex = e
+ time.sleep(i)
+ self.bind()
+ raise fatal_ex
+
+ def state(self):
+ '''Retrieve the active state for a systemd service. As of
+ systemd 249, this is supposed to be one of the following:
+ "active", "reloading", "inactive", "failed", "activating",
+ or "deactivating". These strings are not localized.'''
+ global debug
+
+ l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit',
+ 'ActiveState')
+ try:
+ return self.__dbusrun(l)
+ except Exception as e:
+ if debug:
+ print(e, file = sys.stderr)
+ return 'failed'
+
+ def wait(self, interval = 1):
+ '''Wait until the service finishes.'''
+ global debug
+
+ # Use a poll/sleep loop to wait for the service to finish.
+ # Avoid adding a dependency on python3 glib, which is required
+ # to use an event loop to receive a dbus signal.
+ s = self.state()
+ while s not in ['failed', 'inactive']:
+ if debug:
+ print('waiting %s %s' % (self.unitname, s))
+ time.sleep(interval)
+ s = self.state()
+ if debug:
+ print('waited %s %s' % (self.unitname, s))
+ if s == 'failed':
+ return 1
+ return 0
+
+ def start(self):
+ '''Start the service and wait for it to complete. Returns -1
+ if the service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ global debug
+
+ if debug:
+ print('starting %s' % self.unitname)
+
+ try:
+ self.__dbusrun(lambda: self.unit.Start('replace'))
+ return self.wait()
+ except Exception as e:
+ print(e, file = sys.stderr)
+ return -1
+
+ def stop(self):
+ '''Stop the service.'''
+ global debug
+
+ if debug:
+ print('stopping %s' % self.unitname)
+
+ try:
+ self.__dbusrun(lambda: self.unit.Stop('replace'))
+ return self.wait()
+ except Exception as e:
+ print(e, file = sys.stderr)
+ return -1
+
+def run_service(mnt, killfuncs):
+ '''Run scrub as a service.'''
+ try:
+ svc = scrub_service(mnt)
+ except:
+ return -1
+
+ killfuncs.add(svc.stop)
+ retcode = svc.start()
+ remove_killfunc(killfuncs, svc.stop)
+ return retcode
+
+def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
+ '''Run a scrub process.'''
+ global retcode, terminate
+
+ print("Scrubbing %s..." % mnt)
+ sys.stdout.flush()
+
+ try:
+ if terminate:
+ return
+
+ # Run per-mount systemd bcachefsck service only if we ourselves
+ # are running as a systemd service.
+ if 'SERVICE_MODE' in os.environ:
+ ret = run_service(mnt, killfuncs)
+ if ret == 0 or ret == 1:
+ print("Scrubbing %s done, (err=%d)" % (mnt, ret))
+ sys.stdout.flush()
+ retcode |= ret
+ return
+
+ if terminate:
+ return
+
+ # Invoke bcachefsck manually if we're running in the foreground.
+ # We also permit this if we're running as a cronjob where
+ # systemd services are unavailable.
+ ret = run_subprocess(mnt, killfuncs)
+ if ret >= 0:
+ print("Scrubbing %s done, (err=%d)" % (mnt, ret))
+ sys.stdout.flush()
+ retcode |= ret
+ return
+
+ if terminate:
+ return
+
+ print("Unable to start scrub tool.")
+ sys.stdout.flush()
+ finally:
+ running_devs -= mntdevs
+ cond.acquire()
+ cond.notify()
+ cond.release()
+
+def signal_scrubs(signum, cond):
+ '''Handle termination signals by killing bcachefsck children.'''
+ global debug, terminate
+
+ if debug:
+ print('Signal handler called with signal', signum)
+ sys.stdout.flush()
+
+ terminate = True
+ cond.acquire()
+ cond.notify()
+ cond.release()
+
+def wait_for_termination(cond, killfuncs):
+ '''Wait for a child thread to terminate. Returns True if we should
+ abort the program, False otherwise.'''
+ global debug, terminate
+
+ if debug:
+ print('waiting for threads to terminate')
+ sys.stdout.flush()
+
+ cond.acquire()
+ try:
+ cond.wait()
+ except KeyboardInterrupt:
+ terminate = True
+ cond.release()
+
+ if not terminate:
+ return False
+
+ print("Terminating...")
+ sys.stdout.flush()
+ while len(killfuncs) > 0:
+ fn = killfuncs.pop()
+ fn()
+ return True
+
+def scan_interval(string):
+	'''Parse an interval such as "2w", "3mo" or "90" (bare seconds)
+	into a timedelta.'''
+
+	year = timedelta(seconds = 31556952)
+	# Order matters: 'mo' has to be tried before 'm'.
+	units = (
+		('y', lambda n: year * n),
+		('q', lambda n: timedelta(days = 90 * n)),
+		('mo', lambda n: timedelta(days = 30 * n)),
+		('w', lambda n: timedelta(weeks = n)),
+		('d', lambda n: timedelta(days = n)),
+		('h', lambda n: timedelta(hours = n)),
+		('m', lambda n: timedelta(minutes = n)),
+		('s', lambda n: timedelta(seconds = n)),
+	)
+	for suffix, to_delta in units:
+		if string.endswith(suffix):
+			return to_delta(float(string[:-len(suffix)]))
+	return timedelta(seconds = int(string))
+
+def utcnow():
+	'''Return the current time as a timezone-aware datetime in UTC.'''
+
+	# datetime.utcnow() is deprecated and returns a naive value that then
+	# has to be patched up; now(tz) yields the aware value directly.
+	return datetime.now(timezone.utc)
+
+def main():
+ '''Find mounts, schedule bcachefsck runs.'''
+ def thr(mnt, devs):
+ a = (mnt, cond, running_devs, devs, killfuncs)
+ thr = threading.Thread(target = run_scrub, args = a)
+ thr.start()
+ global retcode, terminate, debug
+
+ parser = argparse.ArgumentParser( \
+ description = "Scrub all mounted bcachefs filesystems.")
+ parser.add_argument("--debug", help = "Enabling debugging messages.", \
+ action = "store_true")
+ args = parser.parse_args()
+
+ if args.debug:
+ debug = True
+
+ fs = find_mounts()
+
+ # Schedule scrub jobs...
+ running_devs = set()
+ killfuncs = set()
+ cond = threading.Condition()
+
+ signal.signal(signal.SIGINT, lambda s, f: signal_scrubs(s, cond))
+ signal.signal(signal.SIGTERM, lambda s, f: signal_scrubs(s, cond))
+
+ while len(fs) > 0:
+ if len(running_devs) == 0:
+ mnt, devs = fs.popitem()
+ running_devs.update(devs)
+ thr(mnt, devs)
+ poppers = set()
+ for mnt in fs:
+ devs = fs[mnt]
+ can_run = True
+ for dev in devs:
+ if dev in running_devs:
+ can_run = False
+ break
+ if can_run:
+ running_devs.update(devs)
+ poppers.add(mnt)
+ thr(mnt, devs)
+ for p in poppers:
+ fs.pop(p)
+
+ # Wait for one thread to finish
+ if wait_for_termination(cond, killfuncs):
+ break
+
+ # Wait for the rest of the threads to finish
+ while len(killfuncs) > 0:
+ wait_for_termination(cond, killfuncs)
+
+ # If we're being run as a service, the return code must fit the LSB
+ # init script action error guidelines, which is to say that we compress
+ # all errors to 1 ("generic or unspecified error", LSB 5.0 section
+ # 22.2) and hope the admin will scan the log for what actually
+ # happened.
+ #
+ # We have to sleep 2 seconds here because journald uses the pid to
+ # connect our log messages to the systemd service. This is critical
+ # for capturing all the log messages if the scrub fails, because the
+ # fail service uses the service name to gather log messages for the
+ # error report.
+ if 'SERVICE_MODE' in os.environ:
+ time.sleep(2)
+ if retcode != 0:
+ retcode = 1
+
+ sys.exit(retcode)
+
+if __name__ == '__main__':
+ main()
diff --git a/fsck/bcachefsck_all.service.in b/fsck/bcachefsck_all.service.in
new file mode 100644
index 00000000..f465473d
--- /dev/null
+++ b/fsck/bcachefsck_all.service.in
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for All Filesystems
+OnFailure=bcachefsck_all_fail.service
+ConditionACPower=true
+Documentation=man:bcachefsck_all(8)
+After=paths.target multi-user.target network.target network-online.target systemd-networkd.service NetworkManager.service connman.service
+
+[Service]
+Type=oneshot
+Environment=SERVICE_MODE=1
+# The script is installed into @libdir@, so it has to be started by its full
+# path, the same way bcachefsck_all_fail.service starts bcachefsck_fail.
+ExecStart=@libdir@/bcachefsck_all
+SyslogIdentifier=bcachefsck_all
+
+# Create the service underneath the scrub background service slice so that we
+# can control resource usage.
+Slice=system-bcachefsck.slice
+
+# Run scrub_all with minimal CPU and IO priority so that nothing will starve.
+IOSchedulingClass=idle
+CPUSchedulingPolicy=idle
+CPUAccounting=true
+Nice=19
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# No special privileges, but we still have to run as root so that we can
+# contact the service manager to start the sub-units.
+CapabilityBoundingSet=
+NoNewPrivileges=true
+RestrictSUIDSGID=true
+
+# Make the entire filesystem readonly except for the media scan stamp file
+# directory. We don't want to hide anything because we need to find all
+# mounted bcachefs filesystems in the host.
+ProtectSystem=strict
+ProtectHome=read-only
+PrivateTmp=false
+
+# No network access except to the systemd control socket
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=AF_UNIX
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# Media scan stamp file shouldn't be readable by regular users
+UMask=0077
+
+# lsblk ignores mountpoints if it can't find the device files, so we cannot
+# hide them
+#ProtectClock=true
+#PrivateDevices=true
diff --git a/fsck/bcachefsck_all.timer b/fsck/bcachefsck_all.timer
new file mode 100644
index 00000000..65470d40
--- /dev/null
+++ b/fsck/bcachefsck_all.timer
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Periodic bcachefsck for All Filesystems
+
+[Timer]
+# Run on Sunday at 3:10am, to avoid running afoul of DST changes
+OnCalendar=Sun *-*-* 03:10:00
+RandomizedDelaySec=60
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/fsck/bcachefsck_all_fail.service.in b/fsck/bcachefsck_all_fail.service.in
new file mode 100644
index 00000000..b79f8196
--- /dev/null
+++ b/fsck/bcachefsck_all_fail.service.in
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for All Filesystems Failure Reporting
+Documentation=man:bcachefsck_all(8)
+
+[Service]
+Type=oneshot
+Environment=EMAIL_ADDR=root
+ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck_all
+User=mail
+Group=mail
+SupplementaryGroups=systemd-journal
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# Make /usr, /boot, /efi and /etc readonly and /home inaccessible.
+ProtectSystem=full
+ProtectHome=yes
+PrivateTmp=true
+RestrictSUIDSGID=true
+
+# Emailing reports requires network access, but not the ability to change the
+# hostname.
+ProtectHostname=true
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Can't hide /proc because journalctl needs it to find various pieces of log
+# information
+#ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# bcachefsck_fail needs no capabilities to run, so drop all of them
+CapabilityBoundingSet=
+NoNewPrivileges=true
+
+# Failure reporting shouldn't create world-readable files
+UMask=0077
+
+# Clean up any IPC objects when this unit stops
+RemoveIPC=true
+
+# No access to hardware device files
+PrivateDevices=true
+ProtectClock=true
diff --git a/fsck/bcachefsck_fail b/fsck/bcachefsck_fail
new file mode 100755
index 00000000..283cee70
--- /dev/null
+++ b/fsck/bcachefsck_fail
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Email logs of failed bcachefsck and bcachefsck_all unit runs
+
+recipient="$1"
+test -z "${recipient}" && exit 0
+service="$2"
+test -z "${service}" && exit 0
+mntpoint="$3"
+
+hostname="$(hostname -f 2>/dev/null)"
+test -z "${hostname}" && hostname="${HOSTNAME}"
+
+mailer="$(command -v sendmail)"
+if [ ! -x "${mailer}" ]; then
+ echo "sendmail: Mailer program not found." >&2
+ exit 1
+fi
+
+fail_mail_mntpoint() {
+ local scrub_svc
+
+ # Turn the mountpoint into a properly escaped systemd instance name
+ scrub_svc="$(systemd-escape --template "${service}@.service" --path "${mntpoint}")"
+ cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure on ${mntpoint}
+Content-Transfer-Encoding: 8bit
+Content-Type: text/plain; charset=UTF-8
+
+So sorry, the automatic ${service} of ${mntpoint} on ${hostname} failed.
+Please do not reply to this message.
+
+A log of what happened follows:
+ENDL
+ systemctl status --full --lines 4294967295 "${scrub_svc}"
+}
+
+fail_mail() {
+ cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure
+
+So sorry, the automatic ${service} on ${hostname} failed.
+
+A log of what happened follows:
+ENDL
+ systemctl status --full --lines 4294967295 "${service}"
+}
+
+if [ -n "${mntpoint}" ]; then
+ fail_mail_mntpoint | "${mailer}" -t -i
+else
+ fail_mail | "${mailer}" -t -i
+fi
+exit "${PIPESTATUS[1]}"
diff --git a/fsck/bcachefsck_fail@.service.in b/fsck/bcachefsck_fail@.service.in
new file mode 100644
index 00000000..369a809a
--- /dev/null
+++ b/fsck/bcachefsck_fail@.service.in
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck Failure Reporting for %f
+Documentation=man:bcachefs(8)
+
+[Service]
+Type=oneshot
+Environment=EMAIL_ADDR=root
+ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck %f
+User=mail
+Group=mail
+SupplementaryGroups=systemd-journal
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# Make /usr, /boot, /efi and /etc readonly and /home inaccessible.
+ProtectSystem=full
+ProtectHome=yes
+PrivateTmp=true
+RestrictSUIDSGID=true
+
+# Emailing reports requires network access, but not the ability to change the
+# hostname.
+ProtectHostname=true
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Can't hide /proc because journalctl needs it to find various pieces of log
+# information
+#ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# bcachefsck_fail needs no capabilities to run, so drop all of them
+CapabilityBoundingSet=
+NoNewPrivileges=true
+
+# Failure reporting shouldn't create world-readable files
+UMask=0077
+
+# Clean up any IPC objects when this unit stops
+RemoveIPC=true
+
+# No access to hardware device files
+PrivateDevices=true
+ProtectClock=true
diff --git a/fsck/system-bcachefsck.slice b/fsck/system-bcachefsck.slice
new file mode 100644
index 00000000..ea368032
--- /dev/null
+++ b/fsck/system-bcachefsck.slice
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=bcachefsck background service slice
+Before=slices.target
+
+[Slice]
+
+# If the CPU usage cgroup controller is available, don't use more than 60% of a
+# single core for all background processes.
+CPUQuota=60%
+CPUAccounting=true
+
+[Install]
+# As of systemd 249, the systemd cgroupv2 configuration code will drop resource
+# controllers from the root and system.slice cgroups at startup if it doesn't
+# find any direct dependencies that require a given controller. Newly
+# activated units with resource control directives are created under the system
+# slice but do not cause a reconfiguration of the slice's resource controllers.
+# Hence we cannot put CPUQuota= into the bcachefsck service units directly.
+#
+# For the CPUQuota directive to have any effect, we must therefore create an
+# explicit definition file for the slice that systemd creates to contain the
+# bcachefsck instance units (e.g. bcachefsck@.service) and we must configure this
+# slice as a dependency of the system slice to establish the direct dependency
+# relation.
+WantedBy=system.slice
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index bb2a0cc4..66de8c0c 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -264,36 +264,54 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
+#define maybe_dev_to_fs(_c) _Generic((_c), \
+ struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
+ struct bch_fs *: (_c))
+
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
+#define bch2_print_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_fn(_c, _ret) \
do { \
@@ -446,6 +464,12 @@ enum bch_time_stats {
struct btree;
+struct log_output {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ struct printbuf buf;
+};
+
enum gc_phase {
GC_PHASE_NOT_RUNNING,
GC_PHASE_START,
@@ -700,6 +724,7 @@ struct bch_fs {
struct super_block *vfs_sb;
dev_t dev;
char name[40];
+ struct log_output *output;
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock;
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index 43822c17..2ac6272c 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -83,6 +83,10 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -386,4 +390,24 @@ struct bch_ioctl_subvolume {
#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_offline {
+ __u64 flags;
+ __u64 opts; /* string */
+ __u64 nr_devs;
+ __u64 devs[0];
+};
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_online {
+ __u64 flags;
+ __u64 opts; /* string */
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 47e7770d..9574c8c4 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -9,6 +9,7 @@
#include "debug.h"
#include "errcode.h"
#include "error.h"
+#include "journal.h"
#include "trace.h"
#include <linux/prefetch.h>
@@ -424,14 +425,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
- if (btree_node_dirty(b))
- bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty_acct(c, b);
-
btree_node_data_free(c, b);
}
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(!bch2_journal_error(&c->journal) &&
+ atomic_read(&c->btree_cache.dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
@@ -502,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
@@ -523,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
@@ -537,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_and_count(c, btree_cache_cannibalize_lock, c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
@@ -675,7 +675,7 @@ err:
mutex_unlock(&bc->lock);
- trace_and_count(c, btree_cache_cannibalize, c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
@@ -751,7 +751,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (path && sync)
bch2_trans_unlock_noassert(trans);
- bch2_btree_node_read(c, b, sync);
+ bch2_btree_node_read(trans, b, sync);
if (!sync)
return NULL;
@@ -1041,7 +1041,7 @@ retry:
goto retry;
if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(c, NULL))
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
goto retry;
if (IS_ERR(b))
@@ -1089,7 +1089,7 @@ lock_node:
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
out:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return b;
}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index cfb80b20..4e1af588 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 3c663c59..a6ac68fe 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1575,16 +1575,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
return 0;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
- trace_and_count(c, btree_node_read, c, b);
+ trace_and_count(c, btree_node_read, trans, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1663,12 +1664,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
@@ -1677,7 +1678,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
- bch2_btree_node_read(c, b, true);
+ bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -1704,8 +1705,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
-void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
+static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+ struct btree_write *w)
{
unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 7e03dd76..e251cb6b 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -130,13 +130,10 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool, bool *);
-void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
-void bch2_btree_complete_write(struct bch_fs *, struct btree *,
- struct btree_write *);
-
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
enum btree_write_flags {
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 4d673d47..929f33df 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -977,7 +977,7 @@ retry_all:
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
}
@@ -1013,7 +1013,7 @@ retry_all:
* then failed to relock a path - that's fine.
*/
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
trans->in_traverse_all = false;
@@ -1298,7 +1298,7 @@ static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path
{
__bch2_btree_path_unlock(trans, path);
btree_path_list_remove(trans, path);
- trans->paths_allocated &= ~(1ULL << path->idx);
+ __clear_bit(path->idx, trans->paths_allocated);
}
void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
@@ -1471,6 +1471,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
struct btree_transaction_stats *s = btree_trans_stats(trans);
struct printbuf buf = PRINTBUF;
+ size_t nr = bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX);
if (!s)
return;
@@ -1479,9 +1480,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
if (!buf.allocation_failure) {
mutex_lock(&s->lock);
- if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
- s->nr_max_paths = trans->nr_max_paths =
- hweight64(trans->paths_allocated);
+ if (nr > s->nr_max_paths) {
+ s->nr_max_paths = nr;
swap(s->max_paths_text, buf.buf);
}
mutex_unlock(&s->lock);
@@ -1489,7 +1489,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
printbuf_exit(&buf);
- trans->nr_max_paths = hweight64(trans->paths_allocated);
+ trans->nr_max_paths = nr;
}
noinline __cold
@@ -1518,13 +1518,12 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
struct btree_path *pos)
{
struct btree_path *path;
- unsigned idx;
+ size_t idx = find_first_zero_bit(trans->paths_allocated, BTREE_ITER_MAX);
- if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ if (unlikely(idx == BTREE_ITER_MAX))
btree_path_overflow(trans);
- idx = __ffs64(~trans->paths_allocated);
+ BUG_ON(idx > BTREE_ITER_MAX);
/*
* Do this before marking the new path as allocated, since it won't be
@@ -1533,7 +1532,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
if (unlikely(idx > trans->nr_max_paths))
bch2_trans_update_max_paths(trans);
- trans->paths_allocated |= 1ULL << idx;
+ __set_bit(idx, trans->paths_allocated);
path = &trans->paths[idx];
path->idx = idx;
@@ -2503,7 +2502,7 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
struct btree_path *path;
unsigned i;
- BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+ BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX));
trans_for_each_path(trans, path) {
BUG_ON(path->sorted_idx >= trans->nr_sorted);
@@ -2513,7 +2512,7 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
for (i = 0; i < trans->nr_sorted; i++) {
unsigned idx = trans->sorted[i];
- EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(!test_bit(idx, trans->paths_allocated));
BUG_ON(trans->paths[idx].sorted_idx != i);
}
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 75beb183..ea4fc8a2 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -66,17 +66,10 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans)
static inline struct btree_path *
__trans_next_path(struct btree_trans *trans, unsigned idx)
{
- u64 l;
-
+ idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, idx);
if (idx == BTREE_ITER_MAX)
return NULL;
-
- l = trans->paths_allocated >> idx;
- if (!l)
- return NULL;
-
- idx += __ffs64(l);
- EBUG_ON(idx >= BTREE_ITER_MAX);
+ EBUG_ON(idx > BTREE_ITER_MAX);
EBUG_ON(trans->paths[idx].idx != idx);
return &trans->paths[idx];
}
@@ -92,17 +85,11 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
static inline struct btree_path *
__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
{
- u64 l;
-
+ *idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, *idx);
if (*idx == BTREE_ITER_MAX)
return NULL;
- l = trans->paths_allocated >> *idx;
- if (!l)
- return NULL;
-
- *idx += __ffs64(l);
- EBUG_ON(*idx >= BTREE_ITER_MAX);
+ EBUG_ON(*idx > BTREE_ITER_MAX);
return &trans->paths[*idx];
}
@@ -631,7 +618,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8)
+ if (bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX) > BTREE_ITER_MAX - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index c5e8a461..b39b28b4 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -997,8 +997,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_for_each_entry_safe(ck, n, &items, list) {
cond_resched();
- bch2_journal_pin_drop(&c->journal, &ck->journal);
-
list_del(&ck->list);
kfree(ck->k);
six_lock_exit(&ck->c.lock);
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 11b0a2c8..a49f1dd1 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -263,7 +263,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+ EBUG_ON(!test_bit(path->idx, trans->paths_allocated));
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index ca752660..78d9f585 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -414,7 +414,7 @@ struct btree_trans {
unsigned extra_journal_res;
unsigned nr_max_paths;
- u64 paths_allocated;
+ unsigned long paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)];
unsigned mem_top;
unsigned mem_max;
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index ba42f578..254794c1 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -531,6 +531,19 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
}
+static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_copy(n, k);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
@@ -541,6 +554,9 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
trans_for_each_wb_update(trans, i) {
if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
bkey_copy(&i->k, k);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c9f07ca4..970faec1 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -163,9 +163,11 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
/* Btree node freeing/allocation: */
-static void __btree_node_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
- trace_and_count(c, btree_node_free, c, b);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, btree_node_free, trans, b);
BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
@@ -191,7 +193,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@@ -362,7 +364,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as,
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_and_count(c, btree_node_alloc, c, b);
+ trace_and_count(c, btree_node_alloc, trans, b);
bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
@@ -452,7 +454,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -465,7 +467,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
unsigned flags,
struct closure *cl)
{
- struct bch_fs *c = as->c;
struct btree *b;
unsigned interior;
int ret = 0;
@@ -476,7 +477,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, cl);
if (ret)
return ret;
@@ -495,7 +496,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
}
}
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -1067,6 +1068,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
+ if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+ watermark < c->journal.watermark) {
+ struct journal_res res = { 0 };
+
+ ret = drop_locks_do(trans,
+ bch2_journal_res_get(&c->journal, &res, 1,
+ watermark|JOURNAL_RES_GET_CHECK));
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
@@ -1211,7 +1223,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_and_count(c, btree_node_set_root, c, b);
+ trace_and_count(c, btree_node_set_root, trans, b);
old = btree_node_root(c, b);
@@ -1465,7 +1477,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
- trace_and_count(c, btree_node_split, c, b);
+ trace_and_count(c, btree_node_split, trans, b);
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -1523,7 +1535,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
- trace_and_count(c, btree_node_compact, c, b);
+ trace_and_count(c, btree_node_compact, trans, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
@@ -1843,7 +1855,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
- trace_and_count(c, btree_node_merge, c, b);
+ trace_and_count(c, btree_node_merge, trans, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
@@ -1946,7 +1958,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, new_path, n);
- trace_and_count(c, btree_node_rewrite, c, b);
+ trace_and_count(c, btree_node_rewrite, trans, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
@@ -2228,7 +2240,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
if (ret) {
ret = drop_locks_do(trans, (closure_sync(&cl), 0));
if (ret)
@@ -2252,7 +2264,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
six_unlock_intent(&new_hash->c.lock);
}
closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -2313,12 +2325,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 312bd0c8..27c74388 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1164,107 +1164,6 @@ int bch2_mark_reservation(struct btree_trans *trans,
return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
}
-static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- u64 *idx, unsigned flags, size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- u64 next_idx = end;
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i_error *new;
-
- new = bch2_trans_kmalloc(trans, sizeof(*new));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_init(&new->k);
- new->k.type = KEY_TYPE_error;
- new->k.p = bkey_start_pos(p.k);
- new->k.p.offset += *idx - start;
- bch2_key_resize(&new->k, next_idx - *idx);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
- BTREE_TRIGGER_NORUN);
- }
-
- *idx = next_idx;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- struct reflink_gc *ref;
- size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx), start = idx;
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
- idx -= le32_to_cpu(p.v->front_pad);
- end += le32_to_cpu(p.v->back_pad);
- }
-
- l = 0;
- r = c->reflink_gc_nr;
- while (l < r) {
- m = l + (r - l) / 2;
-
- ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(trans, p, start, end,
- &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1732,105 +1631,6 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
}
-static int trans_mark_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
-
- refcount = bkey_refcount(k);
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
- }
-
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
-
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
-
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
- }
-
- le64_add_cpu(refcount, add);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
- if (ret)
- goto err;
-
- *idx = k->k.p.offset;
-err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx, end_idx;
- int ret = 0;
-
- idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- end_idx = le64_to_cpu(p.v->idx) + p.k->size +
- le32_to_cpu(p.v->back_pad);
-
- while (idx < end_idx && !ret)
- ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
- return ret;
-}
-
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index bc088673..379101d7 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -335,14 +335,10 @@ int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-
#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index ba0436ae..a042e07c 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -29,6 +29,63 @@ static int copy_to_user_errcode(void __user *to, const void *from, unsigned long
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
+/*
+ * Pairs a kernel thread with the anonymous-inode file that is handed back to
+ * userspace.  run_thread_with_file() installs a pointer to this struct as the
+ * file's private_data, so users embed it in a larger context struct and
+ * recover that context with container_of().
+ */
+struct thread_with_file {
+ struct task_struct *task; /* assigned from kthread_create(); stopped and put by thread_with_file_exit() */
+ int ret; /* result slot, zeroed before the thread starts and written by the thread function */
+};
+
+/*
+ * Stop the kernel thread attached to @thr and drop the task reference.
+ *
+ * ->task is assigned straight from kthread_create(), which reports failure
+ * with an ERR_PTR rather than NULL, and callers run this function from their
+ * generic cleanup paths.  Check for both so that we never hand a bogus
+ * pointer to kthread_stop().
+ */
+static void thread_with_file_exit(struct thread_with_file *thr)
+{
+	if (IS_ERR_OR_NULL(thr->task))
+		return;
+
+	kthread_stop(thr->task);
+	put_task_struct(thr->task);
+}
+
+/*
+ * Create a kernel thread running @fn(@thr) together with an anonymous-inode
+ * file (using @fops, with @thr as private_data) that represents the thread to
+ * userspace.  The thread and file name is built from @fmt and the variadic
+ * arguments.
+ *
+ * Returns the new file descriptor (>= 0) on success; from then on the file
+ * owns @thr and @fops->release is responsible for tearing it down.
+ *
+ * Returns a negative error code on failure.  In that case the thread has been
+ * stopped (if it was ever created) and thr->task is left NULL, so callers may
+ * safely run thread_with_file_exit() from their cleanup path.
+ */
+static int run_thread_with_file(struct thread_with_file *thr,
+				const struct file_operations *fops,
+				int (*fn)(void *), const char *fmt, ...)
+{
+	va_list args;
+	struct file *file = NULL;
+	struct task_struct *task;
+	int ret, fd = -1;
+	struct printbuf name = PRINTBUF;
+	unsigned fd_flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+
+	thr->ret = 0;
+	thr->task = NULL;
+
+	va_start(args, fmt);
+	prt_vprintf(&name, fmt, args);
+	va_end(args);
+
+	/* Formatting produced no buffer (e.g. allocation failure): bail out */
+	if (!name.buf) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * name.buf is already fully formatted and may contain caller supplied
+	 * text (e.g. a filesystem name), so it must never be interpreted as a
+	 * format string itself.
+	 */
+	task = kthread_create(fn, thr, "%s", name.buf);
+	ret = PTR_ERR_OR_ZERO(task);
+	if (ret)
+		goto err;
+
+	ret = get_unused_fd_flags(fd_flags);
+	if (ret < 0)
+		goto err_stop_task;
+	fd = ret;
+
+	file = anon_inode_getfile(name.buf, fops, thr, fd_flags);
+	ret = PTR_ERR_OR_ZERO(file);
+	if (ret)
+		goto err_put_fd;
+
+	/*
+	 * Once fd_install() has run, userspace can close the fd at any time
+	 * and ->release may stop the thread, drop the task reference and free
+	 * @thr.  Take our reference, publish the task and start the thread
+	 * first, and do not touch @thr afterwards.
+	 */
+	get_task_struct(task);
+	thr->task = task;
+	wake_up_process(task);
+	fd_install(fd, file);
+	printbuf_exit(&name);
+	return fd;
+err_put_fd:
+	put_unused_fd(fd);
+err_stop_task:
+	kthread_stop(task);
+err:
+	printbuf_exit(&name);
+	return ret;
+}
+
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -138,8 +195,177 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
+/*
+ * Context for an fsck run driven through an ioctl: the kernel thread doing the
+ * work plus the buffers that carry its log output to the reader of the
+ * returned file descriptor.
+ */
+struct fsck_thread {
+ struct thread_with_file thr; /* file private_data points here; recovered via container_of() */
+ struct printbuf buf; /* NOTE(review): not referenced by any code visible in this change */
+ struct bch_fs *c; /* filesystem to check; set by the online ioctl */
+ char **devs; /* device path strings copied from userspace (offline ioctl) */
+ size_t nr_devs; /* number of entries in @devs */
+ struct bch_opts opts; /* options passed to bch2_fs_open() by the offline thread */
+
+ struct log_output output; /* sink handed to the filesystem through the log_output option */
+ DARRAY(char) output2; /* staging buffer read() drains @output into before copying to userspace */
+};
+
+/*
+ * Tear down an fsck_thread: stop its kernel thread, then release the device
+ * path strings, both output buffers and finally the structure itself.
+ */
+static void bch2_fsck_thread_free(struct fsck_thread *thr)
+{
+	thread_with_file_exit(&thr->thr);
+
+	/* the devs array may never have been allocated */
+	for (size_t idx = 0; thr->devs && idx < thr->nr_devs; idx++)
+		kfree(thr->devs[idx]);
+	kfree(thr->devs);
+
+	printbuf_exit(&thr->output.buf);
+	darray_exit(&thr->output2);
+
+	kfree(thr);
+}
+
+/*
+ * ->release for the fsck file: the file's private_data is the embedded
+ * thread_with_file, so map it back to the owning fsck_thread and free it.
+ */
+static int bch2_fsck_thread_release(struct inode *inode, struct file *file)
+{
+	bch2_fsck_thread_free(container_of(file->private_data, struct fsck_thread, thr));
+
+	return 0;
+}
+
+/*
+ * ->read for the fsck file: hands buffered log output to userspace.
+ *
+ * Sleeps (interruptibly) until either buffer holds data, then repeatedly
+ * moves bytes from the locked producer buffer (output.buf) into the private
+ * staging buffer (output2) and copies from there to @buf, until @len bytes
+ * were delivered or both buffers are empty.
+ *
+ * Returns the number of bytes copied; if nothing was copied, returns the
+ * error that stopped the loop (or the wait_event_interruptible() result).
+ * @ppos is not used.
+ *
+ * NOTE(review): the wait does not look at file->f_flags, so O_NONBLOCK is
+ * not honoured here, and nothing visible signals end-of-stream once the
+ * thread has finished - confirm that this is intended.
+ */
+static ssize_t bch2_fsck_thread_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct fsck_thread *thr = container_of(file->private_data, struct fsck_thread, thr);
+ size_t copied = 0, b;
+ int ret = 0;
+
+ ret = wait_event_interruptible(thr->output.wait,
+ thr->output.buf.pos || thr->output2.nr);
+ if (ret)
+ return ret;
+
+ while (len) {
+ /* grow the staging buffer outside the spinlock; pos is read unlocked here */
+ ret = darray_make_room(&thr->output2, thr->output.buf.pos);
+ if (ret)
+ break;
+
+ spin_lock_irq(&thr->output.lock);
+ /* pos may have grown since the unlocked read above: clamp to the room we have */
+ b = min_t(size_t, darray_room(thr->output2), thr->output.buf.pos);
+
+ memcpy(&darray_top(thr->output2), thr->output.buf.buf, b);
+ /* shift whatever was not transferred to the front of the producer buffer */
+ memmove(thr->output.buf.buf,
+ thr->output.buf.buf + b,
+ thr->output.buf.pos - b);
+
+ thr->output2.nr += b;
+ thr->output.buf.pos -= b;
+ spin_unlock_irq(&thr->output.lock);
+
+ b = min(len, thr->output2.nr);
+ if (!b)
+ break;
+
+ /* copy_to_user() returns the number of bytes NOT copied */
+ b -= copy_to_user(buf, thr->output2.data, b);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ copied += b;
+ buf += b;
+ len -= b;
+
+ /* drop the delivered bytes from the front of the staging buffer */
+ memmove(thr->output2.data,
+ thr->output2.data + b,
+ thr->output2.nr - b);
+ thr->output2.nr -= b;
+ }
+
+ return copied ?: ret;
+}
+
+/*
+ * File operations for the fd returned by the fsck ioctls: a read-only,
+ * non-seekable stream of log output whose release tears down the fsck_thread.
+ */
+static const struct file_operations fsck_thread_ops = {
+ .release = bch2_fsck_thread_release,
+ .read = bch2_fsck_thread_read,
+ .llseek = no_llseek,
+};
+
+/*
+ * Thread body for offline fsck: open the filesystem on the requested devices
+ * with the requested options and, if that worked, stop it again.  The outcome
+ * of the open is recorded in thr->thr.ret; the thread itself always exits
+ * with 0.
+ */
+static int bch2_fsck_offline_thread_fn(void *arg)
+{
+	struct fsck_thread *fsck = container_of(arg, struct fsck_thread, thr);
+	struct bch_fs *fs;
+
+	fs = bch2_fs_open(fsck->devs, fsck->nr_devs, fsck->opts);
+	fsck->thr.ret = PTR_ERR_OR_ZERO(fs);
+
+	if (fsck->thr.ret)
+		return 0;
+
+	bch2_fs_stop(fs);
+	return 0;
+}
+
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck on a set of unmounted devices.
+ *
+ * @user_arg carries the number of devices, an array of userspace pointers to
+ * the device path strings and an optional mount option string.  Everything is
+ * copied into a freshly allocated fsck_thread, the filesystem's log output is
+ * redirected into it through the log_output option, and a thread is started
+ * that opens the filesystem.
+ *
+ * Returns a file descriptor from which the log output can be read, or a
+ * negative error code.  Requires CAP_SYS_ADMIN, like bch2_ioctl_data(),
+ * since the kernel opens caller-named devices on the caller's behalf.
+ */
+static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+	struct bch_ioctl_fsck_offline arg;
+	struct fsck_thread *thr = NULL;
+	u64 *devs = NULL;
+	long ret = 0;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
+	    !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
+	    !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	thr->nr_devs = arg.nr_devs;
+	thr->output.buf = PRINTBUF;
+	thr->output.buf.atomic++;
+	spin_lock_init(&thr->output.lock);
+	init_waitqueue_head(&thr->output.wait);
+	darray_init(&thr->output2);
+
+	/* a faulting user pointer is -EFAULT, not an invalid argument */
+	if (copy_from_user(devs, &user_arg->devs[0], sizeof(user_arg->devs[0]) * arg.nr_devs)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	for (size_t i = 0; i < arg.nr_devs; i++) {
+		char *dev = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
+
+		/*
+		 * Only store successfully duplicated strings: the cleanup path
+		 * kfree()s every slot of thr->devs, and an ERR_PTR must never
+		 * reach kfree().  Unfilled slots stay NULL from kcalloc().
+		 */
+		ret = PTR_ERR_OR_ZERO(dev);
+		if (ret)
+			goto err;
+		thr->devs[i] = dev;
+	}
+
+	if (arg.opts) {
+		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+		ret = PTR_ERR_OR_ZERO(optstr) ?:
+			bch2_parse_mount_opts(NULL, &thr->opts, optstr);
+		/* optstr is an ERR_PTR when strndup_user() failed: don't kfree() that */
+		if (!IS_ERR(optstr))
+			kfree(optstr);
+
+		if (ret)
+			goto err;
+	}
+
+	/* route the filesystem's log messages into our buffer */
+	opt_set(thr->opts, log_output, (u64)(unsigned long)&thr->output);
+
+	ret = run_thread_with_file(&thr->thr,
+				   &fsck_thread_ops,
+				   bch2_fsck_offline_thread_fn,
+				   "bch-fsck");
+err:
+	if (ret < 0) {
+		if (thr)
+			bch2_fsck_thread_free(thr);
+		pr_err("ret %s", bch2_err_str(ret));
+	}
+	kfree(devs);
+	return ret;
+}
+
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
@@ -147,9 +373,18 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
+ case BCH_IOCTL_FSCK_OFFLINE: {
+ ret = bch2_ioctl_fsck_offline(arg);
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
}
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
@@ -299,31 +534,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
}
struct bch_data_ctx {
+ struct thread_with_file thr;
+
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
-
- int ret;
-
- struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
- struct bch_data_ctx *ctx = arg;
-
- ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+ struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
+ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- kthread_stop(ctx->thread);
- put_task_struct(ctx->thread);
+ thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@@ -331,7 +562,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
@@ -357,10 +588,8 @@ static const struct file_operations bcachefs_data_ops = {
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
- struct bch_data_ctx *ctx = NULL;
- struct file *file = NULL;
- unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
- int ret, fd = -1;
+ struct bch_data_ctx *ctx;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -375,36 +604,12 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx,
- "bch-data/%s", c->name);
- if (IS_ERR(ctx->thread)) {
- ret = PTR_ERR(ctx->thread);
- goto err;
- }
-
- ret = get_unused_fd_flags(flags);
+ ret = run_thread_with_file(&ctx->thr,
+ &bcachefs_data_ops,
+ bch2_data_thread,
+ "bch-data/%s", c->name);
if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- fd_install(fd, file);
-
- get_task_struct(ctx->thread);
- wake_up_process(ctx->thread);
-
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (!IS_ERR_OR_NULL(ctx->thread))
- kthread_stop(ctx->thread);
- kfree(ctx);
+ kfree(ctx);
return ret;
}
@@ -690,6 +895,50 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
+/*
+ * Thread body for online fsck.  Currently a placeholder: it looks up the
+ * context and the filesystem and returns 0 without doing any work.  The
+ * disabled block below is a copy of the offline thread body.
+ */
+static int bch2_fsck_online_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c; /* not used yet while the body is compiled out */
+#if 0
+ struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+
+ thr->thr.ret = PTR_ERR_OR_ZERO(c);
+ if (!thr->thr.ret)
+ bch2_fs_stop(c);
+#endif
+ return 0;
+}
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: start an fsck thread for the already running
+ * filesystem @c.
+ *
+ * Allocates an fsck_thread, initialises its output buffers and starts the
+ * online fsck thread.  Returns a file descriptor from which log output can be
+ * read, or a negative error code.  No flags are defined yet, so any non-zero
+ * arg.flags is rejected with -EINVAL.
+ */
+static long bch2_ioctl_fsck_online(struct bch_fs *c,
+				   struct bch_ioctl_fsck_online arg)
+{
+	struct fsck_thread *thr = NULL;
+	long ret = 0;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+	if (!thr)
+		return -ENOMEM;
+
+	thr->c = c;
+	thr->output.buf = PRINTBUF;
+	thr->output.buf.atomic++;
+	spin_lock_init(&thr->output.lock);
+	init_waitqueue_head(&thr->output.wait);
+	darray_init(&thr->output2);
+
+	ret = run_thread_with_file(&thr->thr,
+				   &fsck_thread_ops,
+				   bch2_fsck_online_thread_fn,
+				   "bch-fsck");
+	if (ret < 0) {
+		/*
+		 * A non-negative return is the new file descriptor, not an
+		 * error: only report (and clean up after) real failures.
+		 */
+		bch_err_fn(c, ret);
+		bch2_fsck_thread_free(thr);
+	}
+	return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -745,7 +994,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-
+ case BCH_IOCTL_FSCK_ONLINE:
+ BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
default:
return -ENOTTY;
}
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 51af8ea2..33df8cf8 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
- /*
- * ZSTD is lying: if we allocate the size of the workspace it says it
- * requires, it returns memory allocation errors
- */
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
struct {
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 0d58a872..22d4bb78 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -485,7 +485,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
- if (bkey_deleted(&n->k))
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
@@ -605,7 +605,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
goto done;
}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index e3e2be79..87c13f13 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -174,6 +174,7 @@
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
+ x(EINVAL, opt_parse_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 5a39bcb5..561fc1da 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -287,34 +287,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
bch_notice(c, "shutdown by ioctl type %u", flags);
- down_write(&c->vfs_sb->s_umount);
-
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
ret = freeze_bdev(c->vfs_sb->s_bdev);
if (ret)
- goto err;
-
+ break;
bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
thaw_bdev(c->vfs_sb->s_bdev);
break;
-
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
-
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
break;
default:
ret = -EINVAL;
break;
}
-err:
- up_write(&c->vfs_sb->s_umount);
+
return ret;
}
@@ -341,6 +333,10 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
(arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
return -EINVAL;
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ return -EOPNOTSUPP;
+
if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
create_flags |= BCH_CREATE_SNAPSHOT;
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8dd4046c..8e6f230e 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: not a multiple of 512",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
if (err)
prt_printf(err, "%s: must be a power of two",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if (opt->fn.validate)
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 8526f177..91026dfb 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -419,6 +419,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allocate the buckets_nouse bitmap") \
+ x(log_output, u64, \
+ 0, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Pointer to a struct log_output") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 98f1454c..69b49845 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -171,10 +171,12 @@ static int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = keys->d + i;
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ /* Skip fastpath if we're low on space in the journal */
+ ret = c->journal.watermark ? -1 :
+ commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
BUG_ON(!ret && !k->overwritten);
if (ret) {
@@ -657,13 +659,13 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (!(p->when & PASS_SILENT))
- printk(KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
ret = p->fn(c);
if (ret)
return ret;
if (!(p->when & PASS_SILENT))
- printk(KERN_CONT " done\n");
+ bch2_print(c, KERN_CONT " done\n");
c->recovery_passes_complete |= BIT_ULL(pass);
}
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 07ddf3e8..1d56470e 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io_misc.h"
@@ -73,6 +74,206 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return true;
}
+/*
+ * Transactional trigger helper: adjust the refcount of the single indirect
+ * extent in the reflink btree that is found at offset *@idx, on behalf of the
+ * reflink pointer @p.
+ *
+ * The refcount is incremented unless BTREE_TRIGGER_OVERWRITE is set in @flags,
+ * in which case it is decremented.  With BTREE_TRIGGER_INSERT, @p's front_pad
+ * and back_pad are also widened so that they reach the boundaries of the
+ * indirect extent.
+ *
+ * On success *@idx is advanced to k->k.p.offset of the key that was updated,
+ * so the caller can continue with the next segment.  Returns 0 or a negative
+ * error code; -EIO is returned (after flagging the filesystem inconsistent)
+ * if the key has no refcount or the refcount would underflow.
+ */
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ /* NULL here means the key found at *idx is not a refcounted key type */
+ refcount = bkey_refcount(k);
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ /* dropping a reference from a count that is already zero */
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ /* casts away const: the pads of the key being inserted are updated in place */
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ /* distance from the start of the indirect extent to the pointer's idx */
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ /* distance from the end of the pointed-to range to k->k.p.offset */
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Walk the range of the reflink btree covered by the reflink pointer @k
+ * (its idx and size, extended by front_pad and back_pad) and update the
+ * refcount of every indirect extent in it, one segment at a time.
+ *
+ * Stops at, and returns, the first error; returns 0 once the whole range has
+ * been processed.
+ */
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+				  enum btree_id btree_id, unsigned level,
+				  struct bkey_s_c k, unsigned flags)
+{
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+	u64 pos = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+	u64 range_end = le64_to_cpu(p.v->idx) + p.k->size +
+		le32_to_cpu(p.v->back_pad);
+
+	/* each call advances pos past the segment it handled */
+	while (pos < range_end) {
+		int ret = trans_mark_reflink_p_segment(trans, p, &pos, flags);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Transactional trigger for reflink pointers.
+ *
+ * When a pointer is being inserted its padding fields are cleared first; they
+ * are recomputed while the indirect extents it references are marked.  The
+ * overwrite and insert halves are then run through __trans_mark_reflink_p().
+ */
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			      enum btree_id btree_id, unsigned level,
+			      struct bkey_s_c old,
+			      struct bkey_i *new,
+			      unsigned flags)
+{
+	if (flags & BTREE_TRIGGER_INSERT) {
+		struct bch_reflink_p *ptr = &bkey_i_to_reflink_p(new)->v;
+
+		ptr->back_pad = 0;
+		ptr->front_pad = 0;
+	}
+
+	return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+/*
+ * GC-time helper: account one step of the reflink pointer @p against entry
+ * @r_idx of the in-memory reflink gc table.
+ *
+ * @start is the pointer's unpadded idx, @end the end of the range being
+ * marked, *@idx the current position within that range.
+ *
+ * If the table entry covers *@idx, its refcount is adjusted (+1, or -1 when
+ * BTREE_TRIGGER_OVERWRITE is set) and *@idx moves to the entry's offset.
+ * Otherwise the range from *@idx up to the entry's start (or up to @end when
+ * there is no such entry) is reported as a pointer to a missing indirect
+ * extent; if the fsck error is to be fixed, that part of the pointer is
+ * replaced by a KEY_TYPE_error key in the extents btree, and *@idx moves to
+ * the end of the missing range.
+ *
+ * Returns 0 or a negative error code.
+ */
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ /* ran off the end of the table: everything up to @end is missing */
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ /* r->offset - r->size is where this entry starts; a gap before it is missing */
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i_error *new;
+
+ new = bch2_trans_kmalloc(trans, sizeof(*new));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ /* error key covering the part of @p that maps to the missing range */
+ bkey_init(&new->k);
+ new->k.type = KEY_TYPE_error;
+ new->k.p = bkey_start_pos(p.k);
+ new->k.p.offset += *idx - start;
+ bch2_key_resize(&new->k, next_idx - *idx);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
+ BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+/* NOTE(review): this label is presumably the jump target used inside fsck_err() - confirm */
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * GC-time marking of the reflink pointer @k; must be called with
+ * BTREE_TRIGGER_GC set in @flags.
+ *
+ * Determines the range of the reflink btree the pointer covers (including
+ * its padding once the reflink_p_fix upgrade has completed), locates the
+ * first entry of the reflink gc table whose offset lies beyond the start of
+ * that range, and then accounts the pointer against consecutive table
+ * entries until the range is exhausted or an error occurs.
+ */
+static int __mark_reflink_p(struct btree_trans *trans,
+			    enum btree_id btree_id, unsigned level,
+			    struct bkey_s_c k, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+	u64 start = le64_to_cpu(p.v->idx);
+	u64 pos = start;
+	u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+	size_t lo = 0, hi;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
+		pos -= le32_to_cpu(p.v->front_pad);
+		end += le32_to_cpu(p.v->back_pad);
+	}
+
+	/* binary search for the first table entry with ->offset > pos */
+	hi = c->reflink_gc_nr;
+	while (lo < hi) {
+		size_t mid = lo + (hi - lo) / 2;
+		struct reflink_gc *entry = genradix_ptr(&c->reflink_gc_table, mid);
+
+		if (entry->offset > pos)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	/* every step advances pos and moves on to the next table entry */
+	for (; pos < end && !ret; lo++)
+		ret = __bch2_mark_reflink_p(trans, p, start, end,
+					    &pos, flags, lo);
+
+	return ret;
+}
+
+/*
+ * In-memory (GC) trigger for reflink pointers: hands @old and @new to
+ * __mark_reflink_p() through mem_trigger_run_overwrite_then_insert().
+ */
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -121,6 +322,14 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans,
{
check_indirect_extent_deleting(new, &flags);
+ if (old.k->type == KEY_TYPE_reflink_v &&
+ new->k.type == KEY_TYPE_reflink_v &&
+ old.k->u64s == new->k.u64s &&
+ !memcmp(bkey_s_c_to_reflink_v(old).v->start,
+ bkey_i_to_reflink_v(new)->v.start,
+ bkey_val_bytes(&new->k) - 8))
+ return 0;
+
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
index 8ccf3f9c..6cc9c4a7 100644
--- a/libbcachefs/reflink.h
+++ b/libbcachefs/reflink.h
@@ -9,6 +9,10 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index e7f186b4..3abccdbf 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -80,6 +80,25 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
+/*
+ * printf-style logging for a filesystem.
+ *
+ * Normally the message goes to the kernel log via vprintk().  If the
+ * filesystem has an output redirect installed (c->output), the message is
+ * instead appended to that redirect's buffer under its lock, and anyone
+ * sleeping on its waitqueue is woken.
+ */
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	if (unlikely(c->output)) {
+		unsigned long irq_state;
+
+		spin_lock_irqsave(&c->output->lock, irq_state);
+		prt_vprintf(&c->output->buf, fmt, ap);
+		spin_unlock_irqrestore(&c->output->lock, irq_state);
+
+		/* let a reader blocked on the redirect know there is data */
+		wake_up(&c->output->wait);
+	} else {
+		vprintk(fmt, ap);
+	}
+	va_end(ap);
+}
+
#define KTYPE(type) \
static const struct attribute_group type ## _group = { \
.attrs = type ## _files \
@@ -703,6 +722,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
+ c->output = (void *)(unsigned long) opts.log_output;
+
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 4a7c93bc..1b82a3a9 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -278,8 +278,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (!btree_type_has_ptrs(id))
continue;
- for_each_btree_key(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = for_each_btree_key2(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
@@ -305,8 +305,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
s[t].sectors_compressed += k.k->size;
s[t].sectors_uncompressed += k.k->size;
}
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
}
bch2_trans_put(trans);
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 6e2ad6f3..cfa7ee78 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(trans_str,
__entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
);
-DECLARE_EVENT_CLASS(btree_node,
+DECLARE_EVENT_CLASS(btree_node_nofs,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
@@ -97,6 +97,33 @@ DECLARE_EVENT_CLASS(btree_node,
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
+/*
+ * Event class for btree node events raised from within a btree transaction.
+ * Records the filesystem's device number, the name of the function that
+ * started the transaction (trans->fn, truncated to 32 bytes), and the node's
+ * level, btree id and key position.
+ */
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %s %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -112,6 +139,23 @@ DECLARE_EVENT_CLASS(bch_fs,
TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
+DECLARE_EVENT_CLASS(btree_trans,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
+);
+
DECLARE_EVENT_CLASS(bio,
TP_PROTO(struct bio *bio),
TP_ARGS(bio),
@@ -330,36 +374,36 @@ TRACE_EVENT(btree_cache_scan,
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_cache_reap,
+DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
/* Btree */
DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_node_write,
@@ -383,13 +427,13 @@ TRACE_EVENT(btree_node_write,
);
DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_reserve_get_fail,
@@ -421,28 +465,28 @@ TRACE_EVENT(btree_reserve_get_fail,
);
DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_path_relock_fail,