diff options
46 files changed, 1884 insertions, 443 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision index b0f07a3d..393a80ed 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -71a5b27e017df6ebae391da58857b22fdc406276 +6d44812757ddf81fad087d6abe662355e6712e02 @@ -29,7 +29,7 @@ CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \ -D_LGPL_SOURCE \ -DRCU_MEMBARRIER \ -DZSTD_STATIC_LINKING_ONLY \ - -DFUSE_USE_VERSION=32 \ + -DFUSE_USE_VERSION=35 \ -DNO_BCACHEFS_CHARDEV \ -DNO_BCACHEFS_FS \ -DNO_BCACHEFS_SYSFS \ @@ -91,9 +91,47 @@ else ROOT_SBINDIR?=$(PREFIX)/sbin INITRAMFS_DIR=/etc/initramfs-tools endif +LIBDIR=$(PREFIX)/lib + +PKGCONFIG_SERVICEDIR:=$(shell $(PKG_CONFIG) --variable=systemdsystemunitdir systemd) +ifeq (,$(PKGCONFIG_SERVICEDIR)) + $(warning skipping systemd integration) +else +BCACHEFSCK_ARGS=-f -n +systemd_libfiles=\ + fsck/bcachefsck_fail \ + fsck/bcachefsck_all + +systemd_services=\ + fsck/bcachefsck_fail@.service \ + fsck/bcachefsck@.service \ + fsck/system-bcachefsck.slice \ + fsck/bcachefsck_all_fail.service \ + fsck/bcachefsck_all.service \ + fsck/bcachefsck_all.timer + +built_scripts+=\ + fsck/bcachefsck_fail@.service \ + fsck/bcachefsck@.service \ + fsck/bcachefsck_all_fail.service \ + fsck/bcachefsck_all \ + fsck/bcachefsck_all.service + +%.service: %.service.in + @echo " [SED] $@" + $(Q)sed -e "s|@libdir@|$(LIBDIR)|g" \ + -e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@ + +fsck/bcachefsck_all: fsck/bcachefsck_all.in + @echo " [SED] $@" + $(Q)sed -e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@ + +optional_build+=$(systemd_libfiles) $(systemd_services) +optional_install+=install_systemd +endif # PKGCONFIG_SERVICEDIR .PHONY: all -all: bcachefs +all: bcachefs $(optional_build) .PHONY: debug debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y @@ -157,7 +195,7 @@ cmd_version.o : .version .PHONY: install install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs -install: bcachefs 
+install: bcachefs $(optional_install) $(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/ $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT) @@ -173,11 +211,17 @@ install: bcachefs sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK) echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK) +.PHONY: install_systemd +install_systemd: $(systemd_services) $(systemd_libfiles) + $(INSTALL) -m0755 -D $(systemd_libfiles) -t $(DESTDIR)$(LIBDIR) + $(INSTALL) -m0644 -D $(systemd_services) -t $(DESTDIR)$(PKGCONFIG_SERVICEDIR) + .PHONY: clean clean: @echo "Cleaning all" $(Q)$(RM) bcachefs libbcachefs.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED) $(Q)$(RM) -rf rust-src/*/target + $(Q)$(RM) -f $(built_scripts) .PHONY: deb deb: all @@ -224,7 +268,7 @@ update-bcachefs-sources: git add include/linux/kmemleak.h cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ git add linux/int_sqrt.c - rm libbcachefs/mean_and_variance_test.c + git rm libbcachefs/mean_and_variance_test.c # cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ # git add linux/mean_and_variance.c # cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ @@ -39,10 +39,22 @@ static void dev_usage_type_to_text(struct printbuf *out, struct bch_ioctl_dev_usage_v2 *u, enum bch_data_type type) { + u64 sectors = 0; + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_discard: + case BCH_DATA_need_gc_gens: + /* sectors are 0 for these types so calculate sectors for them */ + sectors = u->d[type].buckets * u->bucket_size; + break; + default: + sectors = u->d[type].sectors; + } + __dev_usage_type_to_text(out, bch2_data_types[type], u->bucket_size, u->d[type].buckets, - u->d[type].sectors, + sectors, u->d[type].fragmented); } @@ -1,5 +1,7 @@ #include <getopt.h> +#include <sys/uio.h> +#include <unistd.h> #include "cmds.h" #include "libbcachefs/error.h" 
#include "libbcachefs.h" @@ -23,6 +25,62 @@ static void usage(void) "Report bugs to <linux-bcachefs@vger.kernel.org>"); } +static void setnonblocking(int fd) +{ + int flags = fcntl(fd, F_GETFL); + if (fcntl(fd, F_SETFL, flags|O_NONBLOCK)) + die("fcntl error: %m"); +} + +static int do_splice(int rfd, int wfd) +{ + char buf[4096]; + + int r = read(rfd, buf, sizeof(buf)); + if (r < 0 && errno == EAGAIN) + return 0; + if (r < 0) + return r; + if (!r) + return 1; + if (write(wfd, buf, r) != r) + die("write error"); + return 0; +} + +static int fsck_online(const char *dev_path) +{ + int dev_idx; + struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx); + + struct bch_ioctl_fsck_online fsck = { 0 }; + + int fsck_fd = ioctl(fs.ioctl_fd, BCH_IOCTL_FSCK_ONLINE, &fsck); + if (fsck_fd < 0) + die("BCH_IOCTL_FSCK_ONLINE error: %s", bch2_err_str(fsck_fd)); + + setnonblocking(STDIN_FILENO); + setnonblocking(fsck_fd); + + while (true) { + fd_set fds; + + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + FD_SET(fsck_fd, &fds); + + select(fsck_fd + 1, &fds, NULL, NULL, NULL); + + int r = do_splice(fsck_fd, STDOUT_FILENO) ?: + do_splice(STDIN_FILENO, fsck_fd); + if (r) + return r < 0 ? 
r : 0; + } + + pr_info("done"); + return 0; +} + int cmd_fsck(int argc, char *argv[]) { static const struct option longopts[] = { @@ -80,16 +138,9 @@ int cmd_fsck(int argc, char *argv[]) exit(8); } - for (i = 0; i < argc; i++) { - switch (dev_mounted(argv[i])) { - case 1: - ret |= 2; - break; - case 2: - fprintf(stderr, "%s is mounted read-write - aborting\n", argv[i]); - exit(8); - } - } + for (i = 0; i < argc; i++) + if (dev_mounted(argv[i])) + return fsck_online(argv[i]); struct bch_fs *c = bch2_fs_open(argv, argc, opts); if (IS_ERR(c)) { diff --git a/cmd_fusemount.c b/cmd_fusemount.c index a09d296c..d81f3188 100644 --- a/cmd_fusemount.c +++ b/cmd_fusemount.c @@ -34,6 +34,15 @@ /* XXX cut and pasted from fsck.c */ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* used by write_aligned function for waiting on bch2_write closure */ +struct write_aligned_op_t { + struct closure cl; + + /* must be last: */ + struct bch_write_op op; +}; + + static inline subvol_inum map_root_ino(u64 ino) { return (subvol_inum) { 1, ino == 1 ? 
4096 : ino }; @@ -343,7 +352,7 @@ static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino, int ret; fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n", - inum, newparent.inum, newname); + inum.inum, newparent.inum, newname); ret = bch2_trans_do(c, NULL, NULL, 0, bch2_link_trans(trans, newparent, &dir_u, @@ -392,6 +401,14 @@ static void bcachefs_fuse_read_endio(struct bio *bio) closure_put(bio->bi_private); } + +static void bcachefs_fuse_write_endio(struct bch_write_op *op) +{ + struct write_aligned_op_t *w = container_of(op,struct write_aligned_op_t,op); + closure_put(&w->cl); +} + + struct fuse_align_io { off_t start; size_t pad_start; @@ -554,41 +571,47 @@ static int write_aligned(struct bch_fs *c, subvol_inum inum, size_t aligned_size, off_t aligned_offset, off_t new_i_size, size_t *written_out) { - struct bch_write_op op = { 0 }; + + struct write_aligned_op_t w = { 0 } +; + struct bch_write_op *op = &w.op; struct bio_vec bv; - struct closure cl; BUG_ON(aligned_size & (block_bytes(c) - 1)); BUG_ON(aligned_offset & (block_bytes(c) - 1)); *written_out = 0; - closure_init_stack(&cl); + closure_init_stack(&w.cl); - bch2_write_op_init(&op, c, io_opts); /* XXX reads from op?! */ - op.write_point = writepoint_hashed(0); - op.nr_replicas = io_opts.data_replicas; - op.target = io_opts.foreground_target; - op.subvol = inum.subvol; - op.pos = POS(inum.inum, aligned_offset >> 9); - op.new_i_size = new_i_size; + bch2_write_op_init(op, c, io_opts); /* XXX reads from op?! 
*/ + op->write_point = writepoint_hashed(0); + op->nr_replicas = io_opts.data_replicas; + op->target = io_opts.foreground_target; + op->subvol = inum.subvol; + op->pos = POS(inum.inum, aligned_offset >> 9); + op->new_i_size = new_i_size; + op->end_io = bcachefs_fuse_write_endio; - userbio_init(&op.wbio.bio, &bv, buf, aligned_size); - bio_set_op_attrs(&op.wbio.bio, REQ_OP_WRITE, REQ_SYNC); + userbio_init(&op->wbio.bio, &bv, buf, aligned_size); + bio_set_op_attrs(&op->wbio.bio, REQ_OP_WRITE, REQ_SYNC); - if (bch2_disk_reservation_get(c, &op.res, aligned_size >> 9, - op.nr_replicas, 0)) { + if (bch2_disk_reservation_get(c, &op->res, aligned_size >> 9, + op->nr_replicas, 0)) { /* XXX: use check_range_allocated like dio write path */ return -ENOSPC; } - closure_call(&op.cl, bch2_write, NULL, &cl); - closure_sync(&cl); + closure_get(&w.cl); - if (!op.error) - *written_out = op.written << 9; + closure_call(&op->cl, bch2_write, NULL, NULL); - return op.error; + closure_sync(&w.cl); + + if (!op->error) + *written_out = op->written << 9; + + return op->error; } static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino, @@ -1255,6 +1278,11 @@ int cmd_fusemount(int argc, char *argv[]) /* This print statement is a trigger for tests. 
*/ printf("Fuse mount initialized.\n"); + if (fuse_opts.foreground == 0){ + printf("Fuse forcing to foreground mode, due gcc constructors usage.\n"); + fuse_opts.foreground = 1; + } + fuse_daemonize(fuse_opts.foreground); ret = fuse_session_loop(se); diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst index 483b9619..56dd8905 100644 --- a/debian/bcachefs-tools.postinst +++ b/debian/bcachefs-tools.postinst @@ -2,6 +2,8 @@ set -e +#DEBHELPER# + case "$1" in configure) if which update-initramfs >/dev/null; then diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm index 6b6fe8ac..2d913367 100644 --- a/debian/bcachefs-tools.postrm +++ b/debian/bcachefs-tools.postrm @@ -2,6 +2,8 @@ set -e +#DEBHELPER# + case "$1" in remove) if which update-initramfs >/dev/null; then diff --git a/debian/control b/debian/control index 2e7c86b2..e68bd7ab 100644 --- a/debian/control +++ b/debian/control @@ -6,12 +6,12 @@ Standards-Version: 3.9.5 Build-Depends: debhelper (>= 9), pkg-config, libaio-dev, libblkid-dev, libkeyutils-dev, liblz4-dev, libsodium-dev, liburcu-dev, libudev-dev, libzstd-dev, uuid-dev, zlib1g-dev, python3, python3-docutils, - rustc, cargo, llvm, clang, libclang-dev + rustc, cargo, llvm, clang, libclang-dev, systemd Homepage: https://bcachefs.org/ Package: bcachefs-tools Architecture: linux-any -Depends: ${shlibs:Depends}, ${misc:Depends} +Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends} Recommends: initramfs-tools | linux-initramfs-tool Description: bcachefs userspace tools Userspace tools for bcachefs, a modern copy on write, checksumming, multi diff --git a/debian/rules b/debian/rules index ae98f5ce..7a713efb 100755 --- a/debian/rules +++ b/debian/rules @@ -3,7 +3,7 @@ PREFIX := /usr %: - dh $@ + dh --with python3 $@ override_dh_auto_install: dh_auto_install -- "PREFIX=$(PREFIX)" diff --git a/fsck/.gitignore b/fsck/.gitignore new file mode 100644 index 00000000..0e3ad1b0 --- /dev/null +++ b/fsck/.gitignore @@ 
-0,0 +1 @@ +*.service diff --git a/fsck/bcachefsck@.service.in b/fsck/bcachefsck@.service.in new file mode 100644 index 00000000..86c1824c --- /dev/null +++ b/fsck/bcachefsck@.service.in @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. Wong <djwong@kernel.org> + +[Unit] +Description=Online bcachefsck for %f +OnFailure=bcachefsck@%i.service +Documentation=man:bcachefs(8) + +# Explicitly require the capabilities that this program needs +ConditionCapability=CAP_SYS_ADMIN +ConditionCapability=CAP_FOWNER +ConditionCapability=CAP_DAC_OVERRIDE +ConditionCapability=CAP_DAC_READ_SEARCH +ConditionCapability=CAP_SYS_RAWIO + +# Must be a mountpoint +ConditionPathIsMountPoint=%f +RequiresMountsFor=%f + +[Service] +Type=oneshot +Environment=SERVICE_MODE=1 +ExecStart=bcachefs fsck --real-mountpoint /tmp/scrub/ @bcachefsck_args@ %f +SyslogIdentifier=%N + +# Run scrub with minimal CPU and IO priority so that nothing else will starve. +IOSchedulingClass=idle +CPUSchedulingPolicy=idle +CPUAccounting=true +Nice=19 + +# Create the service underneath the background service slice so that we can +# control resource usage. +Slice=system-bcachefsck.slice + +# No realtime CPU scheduling +RestrictRealtime=true + +# Dynamically create a user that isn't root +DynamicUser=true + +# Make the entire filesystem readonly and /home inaccessible, then bind mount +# the filesystem we're supposed to be checking into our private /tmp dir. +# 'norbind' means that we don't bind anything under that original mount. +# This enables checking filesystems mounted under /tmp in the global mount +# namespace. 
+ProtectSystem=strict +ProtectHome=yes +PrivateTmp=true +BindPaths=%f:/tmp/scrub:norbind + +# No network access +PrivateNetwork=true +ProtectHostname=true +RestrictAddressFamilies=none +IPAddressDeny=any + +# Don't let the program mess with the kernel configuration at all +ProtectKernelLogs=true +ProtectKernelModules=true +ProtectKernelTunables=true +ProtectControlGroups=true +ProtectProc=invisible +RestrictNamespaces=true + +# Hide everything in /proc, even /proc/mounts +ProcSubset=pid + +# Only allow the default personality Linux +LockPersonality=true + +# No writable memory pages +MemoryDenyWriteExecute=true + +# Don't let our mounts leak out to the host +PrivateMounts=true + +# Restrict system calls to the native arch and only enough to get things going +SystemCallArchitectures=native +SystemCallFilter=@system-service +SystemCallFilter=~@privileged +SystemCallFilter=~@resources +SystemCallFilter=~@mount + +# bcachefsck needs these privileges to run, and no others +CapabilityBoundingSet=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO +AmbientCapabilities=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO +NoNewPrivileges=true + +# bcachefsck doesn't create files +UMask=7777 + +# No access to hardware /dev files except for block devices +ProtectClock=true +DevicePolicy=closed +DeviceAllow=block-* diff --git a/fsck/bcachefsck_all.in b/fsck/bcachefsck_all.in new file mode 100644 index 00000000..4f6031eb --- /dev/null +++ b/fsck/bcachefsck_all.in @@ -0,0 +1,481 @@ +#!/usr/bin/python3 + +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2023-2024 Oracle. All rights reserved. +# +# Author: Darrick J. Wong <djwong@kernel.org> + +# Run bcachefsck in parallel, but avoid thrashing. 
+ +import subprocess +import json +import threading +import time +import sys +import os +import argparse +import signal +import dbus +from io import TextIOWrapper +from pathlib import Path +from datetime import timedelta +from datetime import datetime +from datetime import timezone + +retcode = 0 +terminate = False +debug = False + +def DEVNULL(): + '''Return /dev/null in subprocess writable format.''' + try: + from subprocess import DEVNULL + return DEVNULL + except ImportError: + return open(os.devnull, 'wb') + +def find_mounts(): + '''Map mountpoints to physical disks.''' + def find_bcachefs_mounts(bdev, fs, lastdisk): + '''Attach all lastdisk to each fs found under bdev.''' + if bdev['fstype'] == 'bcachefs' and bdev['mountpoint'] is not None: + mnt = bdev['mountpoint'] + if mnt in fs: + fs[mnt].add(lastdisk.split(':')) + else: + fs[mnt] = set(lastdisk.split(':')) + if 'children' not in bdev: + return + for child in bdev['children']: + find_bcachefs_mounts(child, fs, lastdisk) + + fs = {} + cmd=['lsblk', '-o', 'NAME,KNAME,TYPE,FSTYPE,MOUNTPOINT', '-J'] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE) + result.wait() + if result.returncode != 0: + return fs + sarray = [x.decode(sys.stdout.encoding) for x in result.stdout.readlines()] + output = ' '.join(sarray) + bdevdata = json.loads(output) + + # The lsblk output had better be in disks-then-partitions order + for bdev in bdevdata['blockdevices']: + lastdisk = bdev['kname'] + find_bcachefs_mounts(bdev, fs, lastdisk) + + return fs + +def backtick(cmd): + '''Generator function that yields lines of a program's stdout.''' + p = subprocess.Popen(cmd, stdout = subprocess.PIPE) + for line in TextIOWrapper(p.stdout, encoding="utf-8"): + yield line.strip() + +def remove_killfunc(killfuncs, fn): + '''Ensure fn is not in killfuncs.''' + try: + killfuncs.remove(fn) + except: + pass + +class scrub_control(object): + '''Control object for bcachefsck.''' + def __init__(self): + pass + + def start(self): + '''Start 
scrub and wait for it to complete. Returns -1 if the + service was not started, 0 if it succeeded, or 1 if it + failed.''' + assert False + + def stop(self): + '''Stop scrub.''' + assert False + +class scrub_subprocess(scrub_control): + '''Control object for bcachefsck subprocesses.''' + def __init__(self, mnt): + cmd = ['bcachefs', 'fsck'] + cmd += '@bcachefsck_args@'.split() + cmd += [mnt] + self.cmdline = cmd + self.proc = None + + def start(self): + '''Start bcachefsck and wait for it to complete. Returns -1 if + the service was not started, 0 if it succeeded, or 1 if it + failed.''' + global debug + + if debug: + print('run ', ' '.join(self.cmdline)) + + try: + self.proc = subprocess.Popen(self.cmdline) + self.proc.wait() + except: + return -1 + + proc = self.proc + self.proc = None + return proc.returncode + + def stop(self): + '''Stop bcachefsck.''' + global debug + + if debug: + print('kill ', ' '.join(self.cmdline)) + if self.proc is not None: + self.proc.terminate() + +def run_subprocess(mnt, killfuncs): + '''Run a killable program. Returns program retcode or -1 if we can't + start it.''' + try: + p = scrub_subprocess(mnt) + killfuncs.add(p.stop) + ret = p.start() + remove_killfunc(killfuncs, p.stop) + return ret + except: + return -1 + +# systemd doesn't like unit instance names with slashes in them, so it +# replaces them with dashes when it invokes the service. Filesystem paths +# need a special --path argument so that dashes do not get mangled. 
+def path_to_serviceunit(path): + '''Convert a pathname into a systemd service unit name.''' + + svcname = 'bcachefsck@.service' + cmd = ['systemd-escape', '--template', svcname, '--path', path] + + proc = subprocess.Popen(cmd, stdout = subprocess.PIPE) + proc.wait() + for line in proc.stdout: + return line.decode(sys.stdout.encoding).strip() + +def fibonacci(max_ret): + '''Yield fibonacci sequence up to but not including max_ret.''' + if max_ret < 1: + return + + x = 0 + y = 1 + yield 1 + + z = x + y + while z <= max_ret: + yield z + x = y + y = z + z = x + y + +class scrub_service(scrub_control): + '''Control object for bcachefsck systemd service.''' + def __init__(self, mnt): + self.unitname = path_to_serviceunit(mnt) + self.prop = None + self.unit = None + self.bind() + + def bind(self): + '''Bind to the dbus proxy object for this service.''' + sysbus = dbus.SystemBus() + systemd1 = sysbus.get_object('org.freedesktop.systemd1', + '/org/freedesktop/systemd1') + manager = dbus.Interface(systemd1, + 'org.freedesktop.systemd1.Manager') + path = manager.LoadUnit(self.unitname) + + svc_obj = sysbus.get_object('org.freedesktop.systemd1', path) + self.prop = dbus.Interface(svc_obj, + 'org.freedesktop.DBus.Properties') + self.unit = dbus.Interface(svc_obj, + 'org.freedesktop.systemd1.Unit') + + def __dbusrun(self, lambda_fn): + '''Call the lambda function to execute something on dbus. dbus + exceptions result in retries with Fibonacci backoff, and the + bindings will be rebuilt every time.''' + global debug + + fatal_ex = None + + for i in fibonacci(30): + try: + return lambda_fn() + except dbus.exceptions.DBusException as e: + if debug: + print(e) + fatal_ex = e + time.sleep(i) + self.bind() + raise fatal_ex + + def state(self): + '''Retrieve the active state for a systemd service. As of + systemd 249, this is supposed to be one of the following: + "active", "reloading", "inactive", "failed", "activating", + or "deactivating". 
These strings are not localized.''' + global debug + + l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit', + 'ActiveState') + try: + return self.__dbusrun(l) + except Exception as e: + if debug: + print(e, file = sys.stderr) + return 'failed' + + def wait(self, interval = 1): + '''Wait until the service finishes.''' + global debug + + # Use a poll/sleep loop to wait for the service to finish. + # Avoid adding a dependency on python3 glib, which is required + # to use an event loop to receive a dbus signal. + s = self.state() + while s not in ['failed', 'inactive']: + if debug: + print('waiting %s %s' % (self.unitname, s)) + time.sleep(interval) + s = self.state() + if debug: + print('waited %s %s' % (self.unitname, s)) + if s == 'failed': + return 1 + return 0 + + def start(self): + '''Start the service and wait for it to complete. Returns -1 + if the service was not started, 0 if it succeeded, or 1 if it + failed.''' + global debug + + if debug: + print('starting %s' % self.unitname) + + try: + self.__dbusrun(lambda: self.unit.Start('replace')) + return self.wait() + except Exception as e: + print(e, file = sys.stderr) + return -1 + + def stop(self): + '''Stop the service.''' + global debug + + if debug: + print('stopping %s' % self.unitname) + + try: + self.__dbusrun(lambda: self.unit.Stop('replace')) + return self.wait() + except Exception as e: + print(e, file = sys.stderr) + return -1 + +def run_service(mnt, killfuncs): + '''Run scrub as a service.''' + try: + svc = scrub_service(mnt) + except: + return -1 + + killfuncs.add(svc.stop) + retcode = svc.start() + remove_killfunc(killfuncs, svc.stop) + return retcode + +def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs): + '''Run a scrub process.''' + global retcode, terminate + + print("Scrubbing %s..." % mnt) + sys.stdout.flush() + + try: + if terminate: + return + + # Run per-mount systemd bcachefsck service only if we ourselves + # are running as a systemd service. 
+ if 'SERVICE_MODE' in os.environ: + ret = run_service(mnt, killfuncs) + if ret == 0 or ret == 1: + print("Scrubbing %s done, (err=%d)" % (mnt, ret)) + sys.stdout.flush() + retcode |= ret + return + + if terminate: + return + + # Invoke bcachefsck manually if we're running in the foreground. + # We also permit this if we're running as a cronjob where + # systemd services are unavailable. + ret = run_subprocess(mnt, killfuncs) + if ret >= 0: + print("Scrubbing %s done, (err=%d)" % (mnt, ret)) + sys.stdout.flush() + retcode |= ret + return + + if terminate: + return + + print("Unable to start scrub tool.") + sys.stdout.flush() + finally: + running_devs -= mntdevs + cond.acquire() + cond.notify() + cond.release() + +def signal_scrubs(signum, cond): + '''Handle termination signals by killing bcachefsck children.''' + global debug, terminate + + if debug: + print('Signal handler called with signal', signum) + sys.stdout.flush() + + terminate = True + cond.acquire() + cond.notify() + cond.release() + +def wait_for_termination(cond, killfuncs): + '''Wait for a child thread to terminate. 
Returns True if we should + abort the program, False otherwise.''' + global debug, terminate + + if debug: + print('waiting for threads to terminate') + sys.stdout.flush() + + cond.acquire() + try: + cond.wait() + except KeyboardInterrupt: + terminate = True + cond.release() + + if not terminate: + return False + + print("Terminating...") + sys.stdout.flush() + while len(killfuncs) > 0: + fn = killfuncs.pop() + fn() + return True + +def scan_interval(string): + '''Convert a textual scan interval argument into a time delta.''' + + if string.endswith('y'): + year = timedelta(seconds = 31556952) + return year * float(string[:-1]) + if string.endswith('q'): + return timedelta(days = 90 * float(string[:-1])) + if string.endswith('mo'): + return timedelta(days = 30 * float(string[:-2])) + if string.endswith('w'): + return timedelta(weeks = float(string[:-1])) + if string.endswith('d'): + return timedelta(days = float(string[:-1])) + if string.endswith('h'): + return timedelta(hours = float(string[:-1])) + if string.endswith('m'): + return timedelta(minutes = float(string[:-1])) + if string.endswith('s'): + return timedelta(seconds = float(string[:-1])) + return timedelta(seconds = int(string)) + +def utcnow(): + '''Create a representation of the time right now, in UTC.''' + + dt = datetime.utcnow() + return dt.replace(tzinfo = timezone.utc) + +def main(): + '''Find mounts, schedule bcachefsck runs.''' + def thr(mnt, devs): + a = (mnt, cond, running_devs, devs, killfuncs) + thr = threading.Thread(target = run_scrub, args = a) + thr.start() + global retcode, terminate, debug + + parser = argparse.ArgumentParser( \ + description = "Scrub all mounted bcachefs filesystems.") + parser.add_argument("--debug", help = "Enabling debugging messages.", \ + action = "store_true") + args = parser.parse_args() + + if args.debug: + debug = True + + fs = find_mounts() + + # Schedule scrub jobs... 
+ running_devs = set() + killfuncs = set() + cond = threading.Condition() + + signal.signal(signal.SIGINT, lambda s, f: signal_scrubs(s, cond)) + signal.signal(signal.SIGTERM, lambda s, f: signal_scrubs(s, cond)) + + while len(fs) > 0: + if len(running_devs) == 0: + mnt, devs = fs.popitem() + running_devs.update(devs) + thr(mnt, devs) + poppers = set() + for mnt in fs: + devs = fs[mnt] + can_run = True + for dev in devs: + if dev in running_devs: + can_run = False + break + if can_run: + running_devs.update(devs) + poppers.add(mnt) + thr(mnt, devs) + for p in poppers: + fs.pop(p) + + # Wait for one thread to finish + if wait_for_termination(cond, killfuncs): + break + + # Wait for the rest of the threads to finish + while len(killfuncs) > 0: + wait_for_termination(cond, killfuncs) + + # If we're being run as a service, the return code must fit the LSB + # init script action error guidelines, which is to say that we compress + # all errors to 1 ("generic or unspecified error", LSB 5.0 section + # 22.2) and hope the admin will scan the log for what actually + # happened. + # + # We have to sleep 2 seconds here because journald uses the pid to + # connect our log messages to the systemd service. This is critical + # for capturing all the log messages if the scrub fails, because the + # fail service uses the service name to gather log messages for the + # error report. + if 'SERVICE_MODE' in os.environ: + time.sleep(2) + if retcode != 0: + retcode = 1 + + sys.exit(retcode) + +if __name__ == '__main__': + main() diff --git a/fsck/bcachefsck_all.service.in b/fsck/bcachefsck_all.service.in new file mode 100644 index 00000000..f465473d --- /dev/null +++ b/fsck/bcachefsck_all.service.in @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. 
Wong <djwong@kernel.org> + +[Unit] +Description=Online bcachefsck for All Filesystems +OnFailure=bcachefsck_all_fail.service +ConditionACPower=true +Documentation=man:bcachefsck_all(8) +After=paths.target multi-user.target network.target network-online.target systemd-networkd.service NetworkManager.service connman.service + +[Service] +Type=oneshot +Environment=SERVICE_MODE=1 +ExecStart=bcachefsck_all +SyslogIdentifier=bcachefsck_all + +# Create the service underneath the scrub background service slice so that we +# can control resource usage. +Slice=system-bcachefsck.slice + +# Run scrub_all with minimal CPU and IO priority so that nothing will starve. +IOSchedulingClass=idle +CPUSchedulingPolicy=idle +CPUAccounting=true +Nice=19 + +# No realtime scheduling +RestrictRealtime=true + +# No special privileges, but we still have to run as root so that we can +# contact the service manager to start the sub-units. +CapabilityBoundingSet= +NoNewPrivileges=true +RestrictSUIDSGID=true + +# Make the entire filesystem readonly except for the media scan stamp file +# directory. We don't want to hide anything because we need to find all +# mounted bcachefs filesystems in the host. 
+ProtectSystem=strict +ProtectHome=read-only +PrivateTmp=false + +# No network access except to the systemd control socket +PrivateNetwork=true +ProtectHostname=true +RestrictAddressFamilies=AF_UNIX +IPAddressDeny=any + +# Don't let the program mess with the kernel configuration at all +ProtectKernelLogs=true +ProtectKernelModules=true +ProtectKernelTunables=true +ProtectControlGroups=true +ProtectProc=invisible +RestrictNamespaces=true + +# Hide everything in /proc, even /proc/mounts +ProcSubset=pid + +# Only allow the default personality Linux +LockPersonality=true + +# No writable memory pages +MemoryDenyWriteExecute=true + +# Don't let our mounts leak out to the host +PrivateMounts=true + +# Restrict system calls to the native arch and only enough to get things going +SystemCallArchitectures=native +SystemCallFilter=@system-service +SystemCallFilter=~@privileged +SystemCallFilter=~@resources +SystemCallFilter=~@mount + +# Media scan stamp file shouldn't be readable by regular users +UMask=0077 + +# lsblk ignores mountpoints if it can't find the device files, so we cannot +# hide them +#ProtectClock=true +#PrivateDevices=true diff --git a/fsck/bcachefsck_all.timer b/fsck/bcachefsck_all.timer new file mode 100644 index 00000000..65470d40 --- /dev/null +++ b/fsck/bcachefsck_all.timer @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. Wong <djwong@kernel.org> + +[Unit] +Description=Periodic bcachefsck for All Filesystems + +[Timer] +# Run on Sunday at 3:10am, to avoid running afoul of DST changes +OnCalendar=Sun *-*-* 03:10:00 +RandomizedDelaySec=60 +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/fsck/bcachefsck_all_fail.service.in b/fsck/bcachefsck_all_fail.service.in new file mode 100644 index 00000000..b79f8196 --- /dev/null +++ b/fsck/bcachefsck_all_fail.service.in @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. 
All Rights Reserved. +# Author: Darrick J. Wong <djwong@kernel.org> + +[Unit] +Description=Online bcachefsck for All Filesystems Failure Reporting +Documentation=man:bcachefsck_all(8) + +[Service] +Type=oneshot +Environment=EMAIL_ADDR=root +ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck_all +User=mail +Group=mail +SupplementaryGroups=systemd-journal + +# No realtime scheduling +RestrictRealtime=true + +# Make the entire filesystem readonly and /home inaccessible. +ProtectSystem=full +ProtectHome=yes +PrivateTmp=true +RestrictSUIDSGID=true + +# Emailing reports requires network access, but not the ability to change the +# hostname. +ProtectHostname=true + +# Don't let the program mess with the kernel configuration at all +ProtectKernelLogs=true +ProtectKernelModules=true +ProtectKernelTunables=true +ProtectControlGroups=true +ProtectProc=invisible +RestrictNamespaces=true + +# Can't hide /proc because journalctl needs it to find various pieces of log +# information +#ProcSubset=pid + +# Only allow the default personality Linux +LockPersonality=true + +# No writable memory pages +MemoryDenyWriteExecute=true + +# Don't let our mounts leak out to the host +PrivateMounts=true + +# Restrict system calls to the native arch and only enough to get things going +SystemCallArchitectures=native +SystemCallFilter=@system-service +SystemCallFilter=~@privileged +SystemCallFilter=~@resources +SystemCallFilter=~@mount + +# xfs_scrub needs these privileges to run, and no others +CapabilityBoundingSet= +NoNewPrivileges=true + +# Failure reporting shouldn't create world-readable files +UMask=0077 + +# Clean up any IPC objects when this unit stops +RemoveIPC=true + +# No access to hardware device files +PrivateDevices=true +ProtectClock=true diff --git a/fsck/bcachefsck_fail b/fsck/bcachefsck_fail new file mode 100755 index 00000000..283cee70 --- /dev/null +++ b/fsck/bcachefsck_fail @@ -0,0 +1,63 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 
2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. Wong <djwong@kernel.org> + +# Email logs of failed bcachefsck and bcachefsck_all unit runs + +recipient="$1" +test -z "${recipient}" && exit 0 +service="$2" +test -z "${service}" && exit 0 +mntpoint="$3" + +hostname="$(hostname -f 2>/dev/null)" +test -z "${hostname}" && hostname="${HOSTNAME}" + +mailer="$(command -v sendmail)" +if [ ! -x "${mailer}" ]; then + echo "${mailer}: Mailer program not found." + exit 1 +fi + +fail_mail_mntpoint() { + local scrub_svc + + # Turn the mountpoint into a properly escaped systemd instance name + scrub_svc="$(systemd-escape --template "${service}@.service" --path "${mntpoint}")" + cat << ENDL +To: ${recipient} +From: <${service}@${hostname}> +Subject: ${service} failure on ${mntpoint} +Content-Transfer-Encoding: 8bit +Content-Type: text/plain; charset=UTF-8 + +So sorry, the automatic ${service} of ${mntpoint} on ${hostname} failed. +Please do not reply to this mesage. + +A log of what happened follows: +ENDL + systemctl status --full --lines 4294967295 "${scrub_svc}" +} + +fail_mail() { + cat << ENDL +To: ${recipient} +From: <${service}@${hostname}> +Subject: ${service} failure + +So sorry, the automatic ${service} on ${hostname} failed. + +A log of what happened follows: +ENDL + systemctl status --full --lines 4294967295 "${service}" +} + +if [ -n "${mntpoint}" ]; then + fail_mail_mntpoint | "${mailer}" -t -i +else + fail_mail | "${mailer}" -t -i +fi +exit "${PIPESTATUS[1]}" diff --git a/fsck/bcachefsck_fail@.service.in b/fsck/bcachefsck_fail@.service.in new file mode 100644 index 00000000..369a809a --- /dev/null +++ b/fsck/bcachefsck_fail@.service.in @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. 
Wong <djwong@kernel.org> + +[Unit] +Description=Online bcachefsck Failure Reporting for %f +Documentation=man:bcachefs(8) + +[Service] +Type=oneshot +Environment=EMAIL_ADDR=root +ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefs %f +User=mail +Group=mail +SupplementaryGroups=systemd-journal + +# Create the service underneath the background service slice so that we can +# control resource usage. +Slice=system-bcachefsck.slice + +# No realtime scheduling +RestrictRealtime=true + +# Make the entire filesystem readonly and /home inaccessible. +ProtectSystem=full +ProtectHome=yes +PrivateTmp=true +RestrictSUIDSGID=true + +# Emailing reports requires network access, but not the ability to change the +# hostname. +ProtectHostname=true + +# Don't let the program mess with the kernel configuration at all +ProtectKernelLogs=true +ProtectKernelModules=true +ProtectKernelTunables=true +ProtectControlGroups=true +ProtectProc=invisible +RestrictNamespaces=true + +# Can't hide /proc because journalctl needs it to find various pieces of log +# information +#ProcSubset=pid + +# Only allow the default personality Linux +LockPersonality=true + +# No writable memory pages +MemoryDenyWriteExecute=true + +# Don't let our mounts leak out to the host +PrivateMounts=true + +# Restrict system calls to the native arch and only enough to get things going +SystemCallArchitectures=native +SystemCallFilter=@system-service +SystemCallFilter=~@privileged +SystemCallFilter=~@resources +SystemCallFilter=~@mount + +# Failure reporting needs no capabilities, so clear the bounding set entirely +CapabilityBoundingSet= +NoNewPrivileges=true + +# Failure reporting shouldn't create world-readable files +UMask=0077 + +# Clean up any IPC objects when this unit stops +RemoveIPC=true + +# No access to hardware device files +PrivateDevices=true +ProtectClock=true diff --git a/fsck/system-bcachefsck.slice b/fsck/system-bcachefsck.slice new file mode 100644 index 00000000..ea368032 --- /dev/null +++ 
b/fsck/system-bcachefsck.slice @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2023-2024 Oracle. All Rights Reserved. +# Author: Darrick J. Wong <djwong@kernel.org> + +[Unit] +Description=bcachefsck background service slice +Before=slices.target + +[Slice] + +# If the CPU usage cgroup controller is available, don't use more than 60% of a +# single core for all background processes. +CPUQuota=60% +CPUAccounting=true + +[Install] +# As of systemd 249, the systemd cgroupv2 configuration code will drop resource +# controllers from the root and system.slice cgroups at startup if it doesn't +# find any direct dependencies that require a given controller. Newly +# activated units with resource control directives are created under the system +# slice but do not cause a reconfiguration of the slice's resource controllers. +# Hence we cannot put CPUQuota= into the bcachefsck service units directly. +# +# For the CPUQuota directive to have any effect, we must therefore create an +# explicit definition file for the slice that systemd creates to contain the +# bcachefsck instance units (e.g. bcachefsck@.service) and we must configure this +# slice as a dependency of the system slice to establish the direct dependency +# relation. +WantedBy=system.slice diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index bb2a0cc4..66de8c0c 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -264,36 +264,54 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +void __bch2_print(struct bch_fs *c, const char *fmt, ...); + +#define maybe_dev_to_fs(_c) _Generic((_c), \ + struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ + struct bch_fs *: (_c)) + +#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) + +#define bch2_print_ratelimited(_c, ...) 
\ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ - printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ - printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ - printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ - printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev(ca, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset(ca, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum(c, _inum, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev_ratelimited(ca, fmt, ...) 
\ - printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_fn(_c, _ret) \ do { \ @@ -446,6 +464,12 @@ enum bch_time_stats { struct btree; +struct log_output { + spinlock_t lock; + wait_queue_head_t wait; + struct printbuf buf; +}; + enum gc_phase { GC_PHASE_NOT_RUNNING, GC_PHASE_START, @@ -700,6 +724,7 @@ struct bch_fs { struct super_block *vfs_sb; dev_t dev; char name[40]; + struct log_output *output; /* ro/rw, add/remove/resize devices: */ struct rw_semaphore state_lock; diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 43822c17..2ac6272c 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -83,6 +83,10 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) +#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) + +#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -386,4 +390,24 @@ struct bch_ioctl_subvolume { #define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) 
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) +/* + * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_offline { + __u64 flags; + __u64 opts; /* string */ + __u64 nr_devs; + __u64 devs[0]; +}; + +/* + * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_online { + __u64 flags; + __u64 opts; /* string */ +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 47e7770d..9574c8c4 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -9,6 +9,7 @@ #include "debug.h" #include "errcode.h" #include "error.h" +#include "journal.h" #include "trace.h" #include <linux/prefetch.h> @@ -424,14 +425,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - if (btree_node_dirty(b)) - bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty_acct(c, b); - btree_node_data_free(c, b); } - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(!bch2_journal_error(&c->journal) && + atomic_read(&c->btree_cache.dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); @@ -502,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. 
*/ -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, c); + trace_and_count(c, btree_cache_cannibalize_unlock, trans); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct task_struct *old; @@ -523,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } @@ -537,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: - trace_and_count(c, btree_cache_cannibalize_lock, c); + trace_and_count(c, btree_cache_cannibalize_lock, trans); return 0; } @@ -675,7 +675,7 @@ err: mutex_unlock(&bc->lock); - trace_and_count(c, btree_cache_cannibalize, c); + trace_and_count(c, btree_cache_cannibalize, trans); goto out; } @@ -751,7 +751,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (path && sync) bch2_trans_unlock_noassert(trans); - bch2_btree_node_read(c, b, sync); + bch2_btree_node_read(trans, b, sync); if (!sync) return NULL; @@ -1041,7 +1041,7 @@ retry: goto retry; if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(c, NULL)) + !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if 
(IS_ERR(b)) @@ -1089,7 +1089,7 @@ lock_node: EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return b; } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index cfb80b20..4e1af588 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); +int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 3c663c59..a6ac68fe 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1575,16 +1575,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool return 0; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b, +void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bool sync) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; int ret; - trace_and_count(c, btree_node_read, c, b); + trace_and_count(c, btree_node_read, trans, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1663,12 +1664,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = 
bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); BUG_ON(IS_ERR(b)); @@ -1677,7 +1678,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -1704,8 +1705,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } -void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) +static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) { unsigned long old, new, v = READ_ONCE(b->will_make_reachable); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 7e03dd76..e251cb6b 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -130,13 +130,10 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool, bool *); -void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); -void bch2_btree_complete_write(struct bch_fs *, struct btree *, - struct btree_write *); - bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 4d673d47..929f33df 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -977,7 +977,7 @@ retry_all: closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); } @@ -1013,7 +1013,7 @@ 
retry_all: * then failed to relock a path - that's fine. */ err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); trans->in_traverse_all = false; @@ -1298,7 +1298,7 @@ static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path { __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); - trans->paths_allocated &= ~(1ULL << path->idx); + __clear_bit(path->idx, trans->paths_allocated); } void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) @@ -1471,6 +1471,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; + size_t nr = bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX); if (!s) return; @@ -1479,9 +1480,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) if (!buf.allocation_failure) { mutex_lock(&s->lock); - if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = trans->nr_max_paths = - hweight64(trans->paths_allocated); + if (nr > s->nr_max_paths) { + s->nr_max_paths = nr; swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1489,7 +1489,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); - trans->nr_max_paths = hweight64(trans->paths_allocated); + trans->nr_max_paths = nr; } noinline __cold @@ -1518,13 +1518,12 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, struct btree_path *pos) { struct btree_path *path; - unsigned idx; + size_t idx = find_first_zero_bit(trans->paths_allocated, BTREE_ITER_MAX); - if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + if (unlikely(idx == BTREE_ITER_MAX)) btree_path_overflow(trans); - idx = __ffs64(~trans->paths_allocated); + BUG_ON(idx > BTREE_ITER_MAX); /* * Do this before marking the new path as allocated, since it won't be @@ -1533,7 +1532,7 @@ 
static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, if (unlikely(idx > trans->nr_max_paths)) bch2_trans_update_max_paths(trans); - trans->paths_allocated |= 1ULL << idx; + __set_bit(idx, trans->paths_allocated); path = &trans->paths[idx]; path->idx = idx; @@ -2503,7 +2502,7 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX)); trans_for_each_path(trans, path) { BUG_ON(path->sorted_idx >= trans->nr_sorted); @@ -2513,7 +2512,7 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) for (i = 0; i < trans->nr_sorted; i++) { unsigned idx = trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(!test_bit(idx, trans->paths_allocated)); BUG_ON(trans->paths[idx].sorted_idx != i); } } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 75beb183..ea4fc8a2 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -66,17 +66,10 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans) static inline struct btree_path * __trans_next_path(struct btree_trans *trans, unsigned idx) { - u64 l; - + idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, idx); if (idx == BTREE_ITER_MAX) return NULL; - - l = trans->paths_allocated >> idx; - if (!l) - return NULL; - - idx += __ffs64(l); - EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(idx > BTREE_ITER_MAX); EBUG_ON(trans->paths[idx].idx != idx); return &trans->paths[idx]; } @@ -92,17 +85,11 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) static inline struct btree_path * __trans_next_path_safe(struct btree_trans *trans, unsigned *idx) { - u64 l; - + *idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, *idx); if (*idx == BTREE_ITER_MAX) return NULL; - l = trans->paths_allocated >> *idx; - if (!l) - 
return NULL; - - *idx += __ffs64(l); - EBUG_ON(*idx >= BTREE_ITER_MAX); + EBUG_ON(*idx > BTREE_ITER_MAX); return &trans->paths[*idx]; } @@ -631,7 +618,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) + if (bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX) > BTREE_ITER_MAX - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index c5e8a461..b39b28b4 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -997,8 +997,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_for_each_entry_safe(ck, n, &items, list) { cond_resched(); - bch2_journal_pin_drop(&c->journal, &ck->journal); - list_del(&ck->list); kfree(ck->k); six_lock_exit(&ck->c.lock); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 11b0a2c8..a49f1dd1 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -263,7 +263,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + EBUG_ON(!test_bit(path->idx, trans->paths_allocated)); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index ca752660..78d9f585 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -414,7 +414,7 @@ struct btree_trans { unsigned extra_journal_res; unsigned nr_max_paths; - u64 paths_allocated; + unsigned long paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; unsigned mem_top; unsigned mem_max; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index ba42f578..254794c1 100644 --- a/libbcachefs/btree_update.c +++ 
b/libbcachefs/btree_update.c @@ -531,6 +531,19 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); } +static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_copy(n, k); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) @@ -541,6 +554,9 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans, EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + trans_for_each_wb_update(trans, i) { if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { bkey_copy(&i->k, k); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index c9f07ca4..970faec1 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -163,9 +163,11 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ -static void __btree_node_free(struct bch_fs *c, struct btree *b) +static void __btree_node_free(struct btree_trans *trans, struct btree *b) { - trace_and_count(c, btree_node_free, c, b); + struct bch_fs *c = trans->c; + + trace_and_count(c, btree_node_free, trans, b); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -191,7 +193,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); - __btree_node_free(c, b); + __btree_node_free(trans, b); 
six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -362,7 +364,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_and_count(c, btree_node_alloc, c, b); + trace_and_count(c, btree_node_alloc, trans, b); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -452,7 +454,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -465,7 +467,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, unsigned flags, struct closure *cl) { - struct bch_fs *c = as->c; struct btree *b; unsigned interior; int ret = 0; @@ -476,7 +477,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, * Protects reaping from the btree node cache and using the btree node * open bucket reserve: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret; @@ -495,7 +496,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, } } err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -1067,6 +1068,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; + if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && + watermark < c->journal.watermark) { + struct journal_res res = { 0 }; + + ret = drop_locks_do(trans, + bch2_journal_res_get(&c->journal, &res, 1, + watermark|JOURNAL_RES_GET_CHECK)); + if (ret) + return ERR_PTR(ret); + } + while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1211,7 +1223,7 @@ static void 
bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_and_count(c, btree_node_set_root, c, b); + trace_and_count(c, btree_node_set_root, trans, b); old = btree_node_root(c, b); @@ -1465,7 +1477,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, c, b); + trace_and_count(c, btree_node_split, trans, b); n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1523,7 +1535,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { - trace_and_count(c, btree_node_compact, c, b); + trace_and_count(c, btree_node_compact, trans, b); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -1843,7 +1855,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - trace_and_count(c, btree_node_merge, c, b); + trace_and_count(c, btree_node_merge, trans, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1946,7 +1958,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); - trace_and_count(c, btree_node_rewrite, c, b); + trace_and_count(c, btree_node_rewrite, trans, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); @@ -2228,7 +2240,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); if (ret) { ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) @@ -2252,7 +2264,7 @@ int 
bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite six_unlock_intent(&new_hash->c.lock); } closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -2313,12 +2325,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); set_btree_node_fake(b); set_btree_node_need_rewrite(b); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 312bd0c8..27c74388 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1164,107 +1164,6 @@ int bch2_mark_reservation(struct btree_trans *trans, return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); } -static s64 __bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 start, u64 end, - u64 *idx, unsigned flags, size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - u64 next_idx = end; - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (fsck_err(c, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i_error *new; - - new = bch2_trans_kmalloc(trans, sizeof(*new)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = bkey_start_pos(p.k); - new->k.p.offset += *idx - start; - bch2_key_resize(&new->k, next_idx - *idx); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, - BTREE_TRIGGER_NORUN); - } - - *idx = next_idx; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int __mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - struct reflink_gc *ref; - size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx), start = idx; - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { - idx -= le32_to_cpu(p.v->front_pad); - end += le32_to_cpu(p.v->back_pad); - } - - l = 0; - r = c->reflink_gc_nr; - while (l < r) { - m = l + (r - l) / 2; - - ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = __bch2_mark_reflink_p(trans, p, start, end, - &idx, flags, l++); - - return ret; -} - -int bch2_mark_reflink_p(struct 
btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); -} - void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1732,105 +1631,6 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); } -static int trans_mark_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - struct printbuf buf = PRINTBUF; - int ret; - - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; - - refcount = bkey_refcount(k); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } - - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; - - pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); - - pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); - } - - le64_add_cpu(refcount, add); - - 
bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); - if (ret) - goto err; - - *idx = k->k.p.offset; -err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static int __trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx, end_idx; - int ret = 0; - - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - end_idx = le64_to_cpu(p.v->idx) + p.k->size + - le32_to_cpu(p.v->back_pad); - - while (idx < end_idx && !ret) - ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); - return ret; -} - -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index bc088673..379101d7 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -335,14 +335,10 @@ int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, 
struct bkey_i *, unsigned); int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - #define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index ba0436ae..a042e07c 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -29,6 +29,63 @@ static int copy_to_user_errcode(void __user *to, const void *from, unsigned long return copy_to_user(to, from, n) ? -EFAULT : 0; } +struct thread_with_file { + struct task_struct *task; + int ret; +}; + +static void thread_with_file_exit(struct thread_with_file *thr) +{ + if (thr->task) { + kthread_stop(thr->task); + put_task_struct(thr->task); + } +} + +static int run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *), const char *fmt, ...) 
+{ + va_list args; + struct file *file = NULL; + int ret, fd = -1; + struct printbuf name = PRINTBUF; + unsigned fd_flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; + + va_start(args, fmt); + prt_vprintf(&name, fmt, args); + va_end(args); + + thr->ret = 0; + thr->task = kthread_create(fn, thr, name.buf); + ret = PTR_ERR_OR_ZERO(thr->task); + if (ret) + goto err; + + ret = get_unused_fd_flags(fd_flags); + if (ret < 0) + goto err_stop_task; + fd = ret; + + file = anon_inode_getfile(name.buf, fops, thr, fd_flags); + ret = PTR_ERR_OR_ZERO(file); + if (ret) + goto err_put_fd; + + fd_install(fd, file); + get_task_struct(thr->task); + wake_up_process(thr->task); + printbuf_exit(&name); + return fd; +err_put_fd: + put_unused_fd(fd); +err_stop_task: + kthread_stop(thr->task); +err: + printbuf_exit(&name); + return ret; +} + /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -138,8 +195,177 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif +struct fsck_thread { + struct thread_with_file thr; + struct printbuf buf; + struct bch_fs *c; + char **devs; + size_t nr_devs; + struct bch_opts opts; + + struct log_output output; + DARRAY(char) output2; +}; + +static void bch2_fsck_thread_free(struct fsck_thread *thr) +{ + thread_with_file_exit(&thr->thr); + if (thr->devs) + for (size_t i = 0; i < thr->nr_devs; i++) + kfree(thr->devs[i]); + darray_exit(&thr->output2); + printbuf_exit(&thr->output.buf); + kfree(thr->devs); + kfree(thr); +} + +static int bch2_fsck_thread_release(struct inode *inode, struct file *file) +{ + struct fsck_thread *thr = container_of(file->private_data, struct fsck_thread, thr); + + bch2_fsck_thread_free(thr); + return 0; +} + +static ssize_t bch2_fsck_thread_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct fsck_thread *thr = container_of(file->private_data, struct fsck_thread, thr); + size_t copied = 0, b; + int ret = 0; + + 
ret = wait_event_interruptible(thr->output.wait, + thr->output.buf.pos || thr->output2.nr); + if (ret) + return ret; + + while (len) { + ret = darray_make_room(&thr->output2, thr->output.buf.pos); + if (ret) + break; + + spin_lock_irq(&thr->output.lock); + b = min_t(size_t, darray_room(thr->output2), thr->output.buf.pos); + + memcpy(&darray_top(thr->output2), thr->output.buf.buf, b); + memmove(thr->output.buf.buf, + thr->output.buf.buf + b, + thr->output.buf.pos - b); + + thr->output2.nr += b; + thr->output.buf.pos -= b; + spin_unlock_irq(&thr->output.lock); + + b = min(len, thr->output2.nr); + if (!b) + break; + + b -= copy_to_user(buf, thr->output2.data, b); + if (!b) { + ret = -EFAULT; + break; + } + + copied += b; + buf += b; + len -= b; + + memmove(thr->output2.data, + thr->output2.data + b, + thr->output2.nr - b); + thr->output2.nr -= b; + } + + return copied ?: ret; +} + +static const struct file_operations fsck_thread_ops = { + .release = bch2_fsck_thread_release, + .read = bch2_fsck_thread_read, + .llseek = no_llseek, +}; + +static int bch2_fsck_offline_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); + + thr->thr.ret = PTR_ERR_OR_ZERO(c); + if (!thr->thr.ret) + bch2_fs_stop(c); + return 0; +} + +static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + u64 *devs = NULL; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || + !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || + !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { + ret = -ENOMEM; + goto err; + } + + thr->nr_devs = arg.nr_devs; + thr->output.buf = PRINTBUF; + thr->output.buf.atomic++; + spin_lock_init(&thr->output.lock); + 
init_waitqueue_head(&thr->output.wait); + darray_init(&thr->output2); + + if (copy_from_user(devs, &user_arg->devs[0], sizeof(user_arg->devs[0]) * arg.nr_devs)) { + ret = -EINVAL; + goto err; + } + + for (size_t i = 0; i < arg.nr_devs; i++) { + thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); + ret = PTR_ERR_OR_ZERO(thr->devs[i]); + if (ret) + goto err; + } + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, log_output, (u64)(unsigned long)&thr->output); + + ret = run_thread_with_file(&thr->thr, + &fsck_thread_ops, + bch2_fsck_offline_thread_fn, + "bch-fsck"); +err: + if (ret < 0) { + if (thr) + bch2_fsck_thread_free(thr); + pr_err("ret %s", bch2_err_str(ret)); + } + kfree(devs); + return ret; +} + static long bch2_global_ioctl(unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { #if 0 case BCH_IOCTL_ASSEMBLE: @@ -147,9 +373,18 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) case BCH_IOCTL_INCREMENTAL: return bch2_ioctl_incremental(arg); #endif + case BCH_IOCTL_FSCK_OFFLINE: { + ret = bch2_ioctl_fsck_offline(arg); + break; + } default: - return -ENOTTY; + ret = -ENOTTY; + break; } + + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static long bch2_ioctl_query_uuid(struct bch_fs *c, @@ -299,31 +534,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, } struct bch_data_ctx { + struct thread_with_file thr; + struct bch_fs *c; struct bch_ioctl_data arg; struct bch_move_stats stats; - - int ret; - - struct task_struct *thread; }; static int bch2_data_thread(void *arg) { - struct bch_data_ctx *ctx = arg; - - ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, 
ctx->arg); ctx->stats.data_type = U8_MAX; return 0; } static int bch2_data_job_release(struct inode *inode, struct file *file) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - kthread_stop(ctx->thread); - put_task_struct(ctx->thread); + thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -331,7 +562,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) static ssize_t bch2_data_job_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, @@ -357,10 +588,8 @@ static const struct file_operations bcachefs_data_ops = { static long bch2_ioctl_data(struct bch_fs *c, struct bch_ioctl_data arg) { - struct bch_data_ctx *ctx = NULL; - struct file *file = NULL; - unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; - int ret, fd = -1; + struct bch_data_ctx *ctx; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -375,36 +604,12 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, - "bch-data/%s", c->name); - if (IS_ERR(ctx->thread)) { - ret = PTR_ERR(ctx->thread); - goto err; - } - - ret = get_unused_fd_flags(flags); + ret = run_thread_with_file(&ctx->thr, + &bcachefs_data_ops, + bch2_data_thread, + "bch-data/%s", c->name); if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - fd_install(fd, file); - - get_task_struct(ctx->thread); - wake_up_process(ctx->thread); - - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (!IS_ERR_OR_NULL(ctx->thread)) - kthread_stop(ctx->thread); - kfree(ctx); + kfree(ctx); 
return ret; } @@ -690,6 +895,50 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } +static int bch2_fsck_online_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = thr->c; +#if 0 + struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); + + thr->thr.ret = PTR_ERR_OR_ZERO(c); + if (!thr->thr.ret) + bch2_fs_stop(c); +#endif + return 0; +} + +static long bch2_ioctl_fsck_online(struct bch_fs *c, + struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) + return -ENOMEM; + + thr->c = c; + thr->output.buf = PRINTBUF; + thr->output.buf.atomic++; + spin_lock_init(&thr->output.lock); + init_waitqueue_head(&thr->output.wait); + darray_init(&thr->output2); + + ret = run_thread_with_file(&thr->thr, + &fsck_thread_ops, + bch2_fsck_online_thread_fn, + "bch-fsck"); + bch_err_fn(c, ret); + if (ret < 0) + bch2_fsck_thread_free(thr); + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -745,7 +994,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); case BCH_IOCTL_DISK_RESIZE_JOURNAL: BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - + case BCH_IOCTL_FSCK_ONLINE: + BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); default: return -ENOTTY; } diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 51af8ea2..33df8cf8 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); - /* - * ZSTD is lying: if we allocate the size of the workspace it says it - * requires, it returns memory allocation errors - */ c->zstd_workspace_size = 
zstd_cctx_workspace_bound(&params.cParams); struct { diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 0d58a872..22d4bb78 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -485,7 +485,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ - if (bkey_deleted(&n->k)) + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) n->k.size = 0; return bch2_trans_relock(trans) ?: @@ -605,7 +605,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts); + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); goto done; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index e3e2be79..87c13f13 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -174,6 +174,7 @@ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ + x(EINVAL, opt_parse_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 5a39bcb5..561fc1da 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -287,34 +287,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) bch_notice(c, "shutdown by ioctl type %u", flags); - down_write(&c->vfs_sb->s_umount); - switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: ret = freeze_bdev(c->vfs_sb->s_bdev); if (ret) - goto err; - + break; bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); thaw_bdev(c->vfs_sb->s_bdev); break; - case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); break; default: ret = 
-EINVAL; break; } -err: - up_write(&c->vfs_sb->s_umount); + return ret; } @@ -341,6 +333,10 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) return -EINVAL; + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + return -EOPNOTSUPP; + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) create_flags |= BCH_CREATE_SNAPSHOT; diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 8dd4046c..8e6f230e 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: not a multiple of 512", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) prt_printf(err, "%s: must be a power of two", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if (opt->fn.validate) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8526f177..91026dfb 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -419,6 +419,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ + x(log_output, u64, \ + 0, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 98f1454c..69b49845 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -171,10 +171,12 @@ static int bch2_journal_replay(struct bch_fs *c) struct journal_key *k = keys->d + i; - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_journal_reclaim| - (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), + /* Skip fastpath if we're low on space in the journal */ + ret = c->journal.watermark ? 
-1 : + commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); BUG_ON(!ret && !k->overwritten); if (ret) { @@ -657,13 +659,13 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); ret = p->fn(c); if (ret) return ret; if (!(p->when & PASS_SILENT)) - printk(KERN_CONT " done\n"); + bch2_print(c, KERN_CONT " done\n"); c->recovery_passes_complete |= BIT_ULL(pass); } diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 07ddf3e8..1d56470e 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "io_misc.h" @@ -73,6 +74,206 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +static int trans_mark_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(k); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int __trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; + + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + + while (idx < end_idx && !ret) + ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); + return ret; +} + +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct 
bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); +} + +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i_error *new; + + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, + BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int __mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; + u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + 
p.k->size; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } + + l = 0; + r = c->reflink_gc_nr; + while (l < r) { + m = l + (r - l) / 2; + + ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); + + return ret; +} + +int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); +} + /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -121,6 +322,14 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, { check_indirect_extent_deleting(new, &flags); + if (old.k->type == KEY_TYPE_reflink_v && + new->k.type == KEY_TYPE_reflink_v && + old.k->u64s == new->k.u64s && + !memcmp(bkey_s_c_to_reflink_v(old).v->start, + bkey_i_to_reflink_v(new)->v.start, + bkey_val_bytes(&new->k) - 8)) + return 0; + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); } diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 8ccf3f9c..6cc9c4a7 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -9,6 +9,10 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); #define bch2_bkey_ops_reflink_p 
((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index e7f186b4..3abccdbf 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -80,6 +80,25 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void __bch2_print(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + if (likely(!c->output)) { + vprintk(fmt, args); + } else { + unsigned long flags; + + spin_lock_irqsave(&c->output->lock, flags); + prt_vprintf(&c->output->buf, fmt, args); + spin_unlock_irqrestore(&c->output->lock, flags); + + wake_up(&c->output->wait); + } + va_end(args); +} + #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ @@ -703,6 +722,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } + c->output = (void *)(unsigned long) opts.log_output; + __module_get(THIS_MODULE); closure_init(&c->cl, NULL); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 4a7c93bc..1b82a3a9 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -278,8 +278,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key2(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; @@ -305,8 +305,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c s[t].sectors_compressed += k.k->size; s[t].sectors_uncompressed += k.k->size; } - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 6e2ad6f3..cfa7ee78 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -72,7 +72,7 @@ 
DECLARE_EVENT_CLASS(trans_str, __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node, +DECLARE_EVENT_CLASS(btree_node_nofs, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b), @@ -97,6 +97,33 @@ DECLARE_EVENT_CLASS(btree_node, __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %s %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, + __entry->level, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -112,6 +139,23 @@ DECLARE_EVENT_CLASS(bch_fs, TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); +DECLARE_EVENT_CLASS(btree_trans, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) +); + DECLARE_EVENT_CLASS(bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), @@ -330,36 +374,36 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_cache_reap, +DEFINE_EVENT(btree_node_nofs, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); 
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); /* Btree */ DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_node_write, @@ -383,13 +427,13 @@ TRACE_EVENT(btree_node_write, ); DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_reserve_get_fail, @@ -421,28 +465,28 @@ TRACE_EVENT(btree_reserve_get_fail, ); DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, 
btree_node_split, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_path_relock_fail, |